]> git.donarmstrong.com Git - mothur.git/blobdiff - shhhercommand.cpp
mods to shhh.seqs and trim.flows to fix eye candy output
[mothur.git] / shhhercommand.cpp
index 6833f5d4c1cae8f4ee0e3b6f3da3bd2474b31285..f7c3e59714f4109398f97fb67d8dcb908a1ae9fd 100644 (file)
 #define MIN_WEIGHT 0.1
 #define MIN_TAU 0.0001
 #define MIN_ITER 10
-
 //**********************************************************************************************************************
-
-vector<string> ShhherCommand::getValidParameters(){    
+vector<string> ShhherCommand::setParameters(){ 
        try {
-               string Array[] =  {     
-                       "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors", "maxiter", "mindelta"        
-               };
+               CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
+               CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
+               CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
+               CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
+               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+               CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
+               CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
+               CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
+               CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                
-               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
                return myArray;
        }
        catch(exception& e) {
-               m->errorOut(e, "ShhherCommand", "getValidParameters");
+               m->errorOut(e, "ShhherCommand", "setParameters");
                exit(1);
        }
 }
-
 //**********************************************************************************************************************
-
-ShhherCommand::ShhherCommand(){        
+string ShhherCommand::getHelpString(){ 
        try {
-               abort = true; calledHelp = true;
-               
-               //initialize outputTypes
-               vector<string> tempOutNames;
-               outputTypes["pn.dist"] = tempOutNames;
-
+               string helpString = "";
+               helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
+               return helpString;
        }
        catch(exception& e) {
-               m->errorOut(e, "ShhherCommand", "ShhherCommand");
+               m->errorOut(e, "ShhherCommand", "getHelpString");
                exit(1);
        }
 }
-
 //**********************************************************************************************************************
 
-vector<string> ShhherCommand::getRequiredParameters(){ 
+ShhherCommand::ShhherCommand(){        
        try {
-               string Array[] =  {"flow"};
-               vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
-               return myArray;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ShhherCommand", "getRequiredParameters");
-               exit(1);
-       }
-}
-
-//**********************************************************************************************************************
+               abort = true; calledHelp = true;
+               setParameters();
+               
+               //initialize outputTypes
+//             vector<string> tempOutNames;
+//             outputTypes["pn.dist"] = tempOutNames;
 
-vector<string> ShhherCommand::getRequiredFiles(){      
-       try {
-               vector<string> myArray;
-               return myArray;
        }
        catch(exception& e) {
-               m->errorOut(e, "ShhherCommand", "getRequiredFiles");
+               m->errorOut(e, "ShhherCommand", "ShhherCommand");
                exit(1);
        }
 }
@@ -106,15 +98,10 @@ ShhherCommand::ShhherCommand(string option) {
                
                //allow user to run help
                if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
                
                else {
-                       
-                       //valid paramters for this command
-                       string AlignArray[] =  {
-                               "file", "flow", "lookup", "cutoff", "sigma", "outputdir","inputdir", "processors", "maxiter", "mindelta"        
-                       };
-                       
-                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       vector<string> myArray = setParameters();
                        
                        OptionParser parser(option);
                        map<string,string> parameters = parser.getParameters();
@@ -129,7 +116,7 @@ ShhherCommand::ShhherCommand(string option) {
                        
                        //initialize outputTypes
                        vector<string> tempOutNames;
-                       outputTypes["pn.dist"] = tempOutNames;
+//                     outputTypes["pn.dist"] = tempOutNames;
                        //                      outputTypes["fasta"] = tempOutNames;
                        
                        //if the user changes the input directory command factory will send this info to us in the output parameter 
@@ -173,12 +160,20 @@ ShhherCommand::ShhherCommand(string option) {
                        }
                        else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
                        
-                       if(flowFileName != "not found"){        compositeFASTAFileName = "";    }
+                       if(flowFileName != "not found"){
+                               compositeFASTAFileName = "";    
+                               compositeNamesFileName = "";    
+                       }
                        else{
-                               compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta";
                                ofstream temp;
+
+                               compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
                                m->openOutputFile(compositeFASTAFileName, temp);
                                temp.close();
+                               
+                               compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
+                               m->openOutputFile(compositeNamesFileName, temp);
+                               temp.close();
                        }
                        
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
@@ -196,8 +191,9 @@ ShhherCommand::ShhherCommand(string option) {
                        else if(temp == "not open")     {       abort = true;                   } 
                        else                                            {       lookupFileName = temp;  }
                        
-                       temp = validParameter.validFile(parameters, "processors", false);if (temp == "not found"){      temp = "1";                     }
-                       convert(temp, processors); 
+                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
+                       m->setProcessors(temp);
+                       convert(temp, processors);
 
                        temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
                        convert(temp, cutoff); 
@@ -211,7 +207,12 @@ ShhherCommand::ShhherCommand(string option) {
                        temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
                        convert(temp, sigma); 
                        
-                       globaldata = GlobalData::getInstance();
+                       flowOrder = validParameter.validFile(parameters, "order", false);
+                       if (flowOrder == "not found"){ flowOrder = "TACG";              }
+                       else if(flowOrder.length() != 4){
+                               m->mothurOut("The value of the order option must be four bases long\n");
+                       }
+                       
                }
                        
 #ifdef USE_MPI
@@ -224,23 +225,6 @@ ShhherCommand::ShhherCommand(string option) {
                exit(1);
        }
 }
-
-//**********************************************************************************************************************
-
-ShhherCommand::~ShhherCommand(){}
-
-//**********************************************************************************************************************
-
-void ShhherCommand::help(){
-       try {
-               m->mothurOut("The shhher command reads a file containing flowgrams and creates a file of corrected sequences.\n");
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ShhherCommand", "help");
-               exit(1);
-       }
-}
-
 //**********************************************************************************************************************
 #ifdef USE_MPI
 int ShhherCommand::execute(){
@@ -250,9 +234,6 @@ int ShhherCommand::execute(){
                int tag = 1976;
                MPI_Status status; 
 
-               double begClock = clock();
-               unsigned long int begTime = time(NULL);
-               
                if(pid == 0){
 
                        for(int i=1;i<ncpus;i++){
@@ -288,9 +269,10 @@ int ShhherCommand::execute(){
                        }
                        
                        for(int i=0;i<numFiles;i++){
-                               flowFileName = flowFileVector[i];
+                               double begClock = clock();
+                               unsigned long int begTime = time(NULL);
 
-                               
+                               flowFileName = flowFileVector[i];
                                
                                m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
                                m->mothurOut("Reading flowgrams...\n");
@@ -334,6 +316,11 @@ int ShhherCommand::execute(){
                                string listFileName = cluster(distFileName, namesFileName);
 
                                getOTUData(listFileName);
+
+                               remove(distFileName.c_str());
+                               remove(namesFileName.c_str());
+                               remove(listFileName.c_str());
+                               
                                initPyroCluster();
 
                                for(int i=1;i<ncpus;i++){
@@ -370,7 +357,7 @@ int ShhherCommand::execute(){
                                                MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
                                                MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
                                        }
-                               
+                                       
                                        calcCentroidsDriver(0, numOTUsOnCPU);
                                        
                                        for(int i=1;i<ncpus;i++){
@@ -471,25 +458,23 @@ int ShhherCommand::execute(){
                                m->mothurOut("\nFinalizing...\n");
                                fill();
                                setOTUs();
+                               
                                vector<int> otuCounts(numOTUs, 0);
                                for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
                                calcCentroidsDriver(0, numOTUs);
+                               
                                writeQualities(otuCounts);
                                writeSequences(otuCounts);
                                writeNames(otuCounts);
                                writeClusters(otuCounts);
                                writeGroups();
                                
-                               remove(distFileName.c_str());
-                               remove(namesFileName.c_str());
-                               remove(listFileName.c_str());
                                                                 
                                m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
                        }
                }
                else{
                        int abort = 1;
-                       bool live = 1;
 
                        MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                        if(abort){      return 0;       }
@@ -499,6 +484,8 @@ int ShhherCommand::execute(){
 
                        for(int i=0;i<numFiles;i++){
                                //Now into the pyrodist part
+                               bool live = 1;
+
                                char fileName[1024];
                                MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
                                MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
@@ -554,12 +541,12 @@ int ShhherCommand::execute(){
                                int total;
 
                                while(live){
-
+                                       
                                        MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                        singleTau.assign(total, 0.0000);
                                        seqNumber.assign(total, 0);
                                        seqIndex.assign(total, 0);
-                                       
+
                                        MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
@@ -567,13 +554,12 @@ int ShhherCommand::execute(){
                                        MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
-                                       
+
                                        calcCentroidsDriver(startOTU, endOTU);
 
                                        MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
                                        MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
 
-                                       
                                        MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
                                        MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
@@ -592,8 +578,19 @@ int ShhherCommand::execute(){
                                }
                        }
                }               
-
                MPI_Barrier(MPI_COMM_WORLD);
+
+               
+               if(compositeFASTAFileName != ""){
+                       outputNames.push_back(compositeFASTAFileName);
+                       outputNames.push_back(compositeNamesFileName);
+               }
+
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+               
                return 0;
 
        }
@@ -634,7 +631,7 @@ string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
                
                m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
                
-               string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
+               string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
                if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
 
                ofstream distFile(fDistFileName.c_str());
@@ -696,6 +693,10 @@ int ShhherCommand::execute(){
                        string listFileName = cluster(distFileName, namesFileName);
                        getOTUData(listFileName);
                        
+                       remove(distFileName.c_str());
+                       remove(namesFileName.c_str());
+                       remove(listFileName.c_str());
+                       
                        initPyroCluster();
                        
                        double maxDelta = 0;
@@ -704,7 +705,6 @@ int ShhherCommand::execute(){
                        double begClock = clock();
                        unsigned long int begTime = time(NULL);
 
-                       
                        m->mothurOut("\nDenoising flowgrams...\n");
                        m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
                        
@@ -742,12 +742,19 @@ int ShhherCommand::execute(){
                        writeClusters(otuCounts);
                        writeGroups();
                        
-                       remove(distFileName.c_str());
-                       remove(namesFileName.c_str());
-                       remove(listFileName.c_str());
-                       
                        m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                }
+               
+               if(compositeFASTAFileName != ""){
+                       outputNames.push_back(compositeFASTAFileName);
+                       outputNames.push_back(compositeNamesFileName);
+               }
+
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
+               m->mothurOutEndLine();
+               
                return 0;
        }
        catch(exception& e) {
@@ -1039,7 +1046,7 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st
 
 string ShhherCommand::createDistFile(int processors){
        try{
-               string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
+               string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
                                
                unsigned long int begTime = time(NULL);
                double begClock = clock();
@@ -1127,7 +1134,7 @@ string ShhherCommand::createNamesFile(){
                        duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
                }
                
-               string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
+               string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
                
                ofstream nameFile;
                m->openOutputFile(nameFileName, nameFile);
@@ -1151,11 +1158,6 @@ string ShhherCommand::createNamesFile(){
 string ShhherCommand::cluster(string distFileName, string namesFileName){
        try {
                
-               
-               globaldata->setNameFile(namesFileName);
-               globaldata->setColumnFile(distFileName);
-               globaldata->setFormat("column");
-               
                ReadMatrix* read = new ReadColumnMatrix(distFileName);  
                read->setCutoff(cutoff);
                
@@ -1181,7 +1183,7 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){
                
                list->setLabel(toString(cutoff));
                
-               string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
+               string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
                ofstream listFile;
                m->openOutputFile(listFileName, listFile);
                list->print(listFile);
@@ -1912,7 +1914,7 @@ void ShhherCommand::setOTUs(){
 void ShhherCommand::writeQualities(vector<int> otuCounts){
        
        try {
-               string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
+               string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual";
 
                ofstream qualityFile;
                m->openOutputFile(qualityFileName, qualityFile);
@@ -2001,7 +2003,8 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
                        }
                }
                qualityFile.close();
-               
+               outputNames.push_back(qualityFileName);
+
        }
        catch(exception& e) {
                m->errorOut(e, "ShhherCommand", "writeQualities");
@@ -2013,9 +2016,8 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
 
 void ShhherCommand::writeSequences(vector<int> otuCounts){
        try {
-               string bases = "TACG";
                
-               string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
+               string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta";
                ofstream fastaFile;
                m->openOutputFile(fastaFileName, fastaFile);
                
@@ -2027,18 +2029,23 @@ void ShhherCommand::writeSequences(vector<int> otuCounts){
                        if(otuCounts[i] > 0){
                                fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
                                
-                               for(int j=8;j<numFlowCells;j++){
+                               string newSeq = "";
+                               
+                               for(int j=0;j<numFlowCells;j++){
                                        
-                                       char base = bases[j % 4];
+                                       char base = flowOrder[j % 4];
                                        for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
-                                               fastaFile << base;
+                                               newSeq += base;
                                        }
                                }
-                               fastaFile << endl;
+                               
+                               fastaFile << newSeq.substr(4) << endl;
                        }
                }
                fastaFile.close();
-               
+
+               outputNames.push_back(fastaFileName);
+
                if(compositeFASTAFileName != ""){
                        m->appendFiles(fastaFileName, compositeFASTAFileName);
                }
@@ -2053,7 +2060,7 @@ void ShhherCommand::writeSequences(vector<int> otuCounts){
 
 void ShhherCommand::writeNames(vector<int> otuCounts){
        try {
-               string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
+               string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
                ofstream nameFile;
                m->openOutputFile(nameFileName, nameFile);
                
@@ -2069,6 +2076,12 @@ void ShhherCommand::writeNames(vector<int> otuCounts){
                        }
                }
                nameFile.close();
+               outputNames.push_back(nameFileName);
+               
+               
+               if(compositeNamesFileName != ""){
+                       m->appendFiles(nameFileName, compositeNamesFileName);
+               }               
        }
        catch(exception& e) {
                m->errorOut(e, "ShhherCommand", "writeNames");
@@ -2081,7 +2094,7 @@ void ShhherCommand::writeNames(vector<int> otuCounts){
 void ShhherCommand::writeGroups(){
        try {
                string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
-               string groupFileName = fileRoot + ".pn.groups";
+               string groupFileName = fileRoot + ".shhh.groups";
                ofstream groupFile;
                m->openOutputFile(groupFileName, groupFile);
                
@@ -2089,6 +2102,8 @@ void ShhherCommand::writeGroups(){
                        groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
                }
                groupFile.close();
+               outputNames.push_back(groupFileName);
+
        }
        catch(exception& e) {
                m->errorOut(e, "ShhherCommand", "writeGroups");
@@ -2100,11 +2115,11 @@ void ShhherCommand::writeGroups(){
 
 void ShhherCommand::writeClusters(vector<int> otuCounts){
        try {
-               string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
+               string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts";
                ofstream otuCountsFile;
                m->openOutputFile(otuCountsFileName, otuCountsFile);
                
-               string bases = "TACG";
+               string bases = flowOrder;
                
                for(int i=0;i<numOTUs;i++){
                        //output the translated version of the centroid sequence for the otu
@@ -2124,20 +2139,25 @@ void ShhherCommand::writeClusters(vector<int> otuCounts){
                                        int sequence = aaI[i][j];
                                        otuCountsFile << seqNameVector[sequence] << '\t';
                                        
-                                       for(int k=8;k<lengths[sequence];k++){
+                                       string newSeq = "";
+                                       
+                                       for(int k=0;k<lengths[sequence];k++){
                                                char base = bases[k % 4];
                                                int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
-                                               
+                                                       
                                                for(int s=0;s<freq;s++){
-                                                       otuCountsFile << base;
+                                                       newSeq += base;
+                                                       //otuCountsFile << base;
                                                }
                                        }
-                                       otuCountsFile << endl;
+                                       otuCountsFile << newSeq.substr(4) << endl;
                                }
                                otuCountsFile << endl;
                        }
                }
                otuCountsFile.close();
+               outputNames.push_back(otuCountsFileName);
+
        }
        catch(exception& e) {
                m->errorOut(e, "ShhherCommand", "writeClusters");