]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeraperseuscommand.cpp
working on chimera.uchime change for dereplicate=t bug. added shared file to get...
[mothur.git] / chimeraperseuscommand.cpp
index e57c48a7e168cb0b1afa719e425be7c423b5aa62..66889f145af3340d5a7fe38db4e8aa34997c2219 100644 (file)
 //**********************************************************************************************************************
 vector<string> ChimeraPerseusCommand::setParameters(){ 
        try {
-               CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
-               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none",false,false); parameters.push_back(pname);
-        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none",false,false); parameters.push_back(pcount);
-               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
-               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
-               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
-               CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "",false,false); parameters.push_back(pcutoff);
-               CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "",false,false); parameters.push_back(palpha);
-               CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "",false,false); parameters.push_back(pbeta);
+               CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","chimera-accnos",false,true,true); parameters.push_back(pfasta);
+               CommandParameter pname("name", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
+               CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
+               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+        CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups);
+
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
+               CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pcutoff);
+               CommandParameter palpha("alpha", "Number", "", "-5.54", "", "", "","",false,false); parameters.push_back(palpha);
+               CommandParameter pbeta("beta", "Number", "", "0.33", "", "", "","",false,false); parameters.push_back(pbeta);
                        
                vector<string> myArray;
                for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
@@ -40,13 +42,14 @@ string ChimeraPerseusCommand::getHelpString(){
        try {
                string helpString = "";
                helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n";
-               helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
+               helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n";
                helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n";
         helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n";
                helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
                helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
+        helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
                helpString += "The alpha parameter ....  The default is -5.54. \n";
                helpString += "The beta parameter ....  The default is 0.33. \n";
                helpString += "The cutoff parameter ....  The default is 0.50. \n";
@@ -62,25 +65,20 @@ string ChimeraPerseusCommand::getHelpString(){
        }
 }
 //**********************************************************************************************************************
-string ChimeraPerseusCommand::getOutputFileNameTag(string type, string inputName=""){  
-       try {
-        string outputFileName = "";
-               map<string, vector<string> >::iterator it;
+string ChimeraPerseusCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
         
-        //is this a type this command creates
-        it = outputTypes.find(type);
-        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
-        else {
-            if (type == "chimera") {  outputFileName =  "perseus.chimeras"; }
-            else if (type == "accnos") {  outputFileName =  "perseus.accnos"; }
-            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
-        }
-        return outputFileName;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ChimeraPerseusCommand", "getOutputFileNameTag");
-               exit(1);
-       }
+        if (type == "chimera") {  pattern = "[filename],perseus.chimeras"; } 
+        else if (type == "accnos") {  pattern = "[filename],perseus.accnos"; } 
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ChimeraPerseusCommand", "getOutputPattern");
+        exit(1);
+    }
 }
 //**********************************************************************************************************************
 ChimeraPerseusCommand::ChimeraPerseusCommand(){        
@@ -295,7 +293,7 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                                        bool ignore = false;
                                        if (countfileNames[i] == "current") { 
                                                countfileNames[i] = m->getCountTableFile(); 
-                                               if (nameFileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
+                                               if (countfileNames[i] != "") {  m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); }
                                                else {  
                                                        m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; 
                                                        //erase from file list
@@ -466,6 +464,13 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option)  {
                        
                        temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.33";  }
                        m->mothurConvert(temp, beta);
+            
+                       temp = validParameter.validFile(parameters, "dereplicate", false);      
+                       if (temp == "not found") { 
+                               if (groupfile != "")    {  temp = "false";                                      }
+                               else                    {  temp = "true";       }
+                       }
+                       dups = m->isTrue(temp);
                }
        }
        catch(exception& e) {
@@ -486,9 +491,11 @@ int ChimeraPerseusCommand::execute(){
                        m->mothurOut("Checking sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine();
                        
                        int start = time(NULL); 
-                       if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]);  }//if user entered a file with a path then preserve it                               
-                       string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("chimera");
-                       string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + getOutputFileNameTag("accnos");
+                       if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]);  }//if user entered a file with a path then preserve it       
+                       map<string, string> variables;
+                       variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s]));
+                       string outputFileName = getOutputFileName("chimera", variables);
+                       string accnosFileName = getOutputFileName("accnos", variables);
 
                        //string newFasta = m->getRootName(fastaFileNames[s]) + "temp";
                        
@@ -527,7 +534,9 @@ int ChimeraPerseusCommand::execute(){
                     
                     if (m->control_pressed) {  delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                     map<string, string> uniqueNames = cparser->getAllSeqsMap();
-                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    if (!dups) { 
+                        numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    }
                     delete cparser;
 
                     m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
@@ -563,7 +572,9 @@ int ChimeraPerseusCommand::execute(){
                     
                     if (m->control_pressed) {  delete parser; for (int j = 0; j < outputNames.size(); j++) {   m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                     map<string, string> uniqueNames = parser->getAllSeqsMap();
-                    numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    if (!dups) { 
+                        numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName);
+                    }
                     delete parser;
                     
                     m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
@@ -695,6 +706,7 @@ vector<seqData> ChimeraPerseusCommand::loadSequences(string group){
                 it = counts.find(thisGroupsSeqs[i].getName());
                 if (it == counts.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your count file, please correct."); m->mothurOutEndLine(); }
                 else {
+                    thisGroupsSeqs[i].setAligned(removeNs(thisGroupsSeqs[i].getUnaligned()));
                     sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), it->second));
                     if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
                 }
@@ -712,6 +724,7 @@ vector<seqData> ChimeraPerseusCommand::loadSequences(string group){
                 if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + thisGroupsSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
                 else {
                     int num = m->getNumNames(it->second);
+                    thisGroupsSeqs[i].setAligned(removeNs(thisGroupsSeqs[i].getUnaligned()));
                     sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
                     if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
                 }
@@ -753,6 +766,7 @@ vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
                        it = nameMap.find(temp.getName());
                        if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
                        else {
+                temp.setAligned(removeNs(temp.getUnaligned()));
                                sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
                 if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
                        }
@@ -772,6 +786,20 @@ vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, string name){
        }
 }
 //**********************************************************************************************************************
+string ChimeraPerseusCommand::removeNs(string seq){
+       try {
+        string newSeq = "";
+        for (int i = 0; i < seq.length(); i++) {
+            if (seq[i] != 'N') {  newSeq += seq[i]; }
+        }
+        return newSeq;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraPerseusCommand", "removeNs");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
 vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, CountTable* ct){
        try {           
                //read fasta file and create sequenceData structure - checking for file mismatches
@@ -786,6 +814,7 @@ vector<seqData> ChimeraPerseusCommand::readFiles(string inputFile, CountTable* c
                        int count = ct->getNumSeqs(temp.getName());
                        if (m->control_pressed) { break; }
                        else {
+                temp.setAligned(removeNs(temp.getUnaligned()));
                                sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), count));
                 if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
                        }
@@ -896,8 +925,8 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                                        type = "chimera";
                                        chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
                                }
-                               ;
-                               if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
+
+                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
                                
                                double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
                                
@@ -1047,6 +1076,9 @@ int ChimeraPerseusCommand::createProcessesGroups(string outputFName, string accn
                        
                //Close all thread handles and free memory allocations.
                for(int i=0; i < pDataArray.size(); i++){
+            if (pDataArray[i]->count != pDataArray[i]->end) {
+                m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
+            }
                        num += pDataArray[i]->count;
                        CloseHandle(hThreadArray[i]);
                        delete pDataArray[i];