]> git.donarmstrong.com Git - mothur.git/blobdiff - screenseqscommand.cpp
reworked the classifiers summary file added groupfile option to classify.seqs, added...
[mothur.git] / screenseqscommand.cpp
index ed71559fba582e8d5a696621ab943a4851579704..5b3d7d7604c4f0fe5460265fbd611e12dded4629 100644 (file)
 
 //***************************************************************************************************************
 
-ScreenSeqsCommand::ScreenSeqsCommand(){
+ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
        try {
-               globaldata = GlobalData::getInstance();
-               if(globaldata->getFastaFile() == "")            {       cout << "you must provide a fasta formatted file" << endl;      }
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       //valid paramters for this command
+                       string AlignArray[] =  {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
+                                                                       "name", "group", "alignreport","outputdir","inputdir"};
+                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string,string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string,string>::iterator it;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("fasta");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                               
+                               it = parameters.find("alignreport");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["alignreport"] = inputDir + it->second;              }
+                               }
+                       }
+
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
+                       else if (fastafile == "not open") { abort = true; }     
+       
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not open") { abort = true; }  
+                       else if (groupfile == "not found") { groupfile = ""; }
+                       
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not open") { abort = true; }
+                       else if (namefile == "not found") { namefile = ""; }    
+
+                       alignreport = validParameter.validFile(parameters, "alignreport", true);
+                       if (alignreport == "not open") { abort = true; }
+                       else if (alignreport == "not found") { alignreport = ""; }      
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
+                               outputDir = ""; 
+                               outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it  
+                       }
+
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       string temp;
+                       temp = validParameter.validFile(parameters, "start", false);            if (temp == "not found") { temp = "-1"; }
+                       convert(temp, startPos); 
+               
+                       temp = validParameter.validFile(parameters, "end", false);                      if (temp == "not found") { temp = "-1"; }
+                       convert(temp, endPos);  
+
+                       temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
+                       convert(temp, maxAmbig);  
+
+                       temp = validParameter.validFile(parameters, "maxhomop", false);         if (temp == "not found") { temp = "-1"; }
+                       convert(temp, maxHomoP);  
+
+                       temp = validParameter.validFile(parameters, "minlength", false);        if (temp == "not found") { temp = "-1"; }
+                       convert(temp, minLength); 
+                       
+                       temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "-1"; }
+                       convert(temp, maxLength); 
+               }
+
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the ScreenSeqsCommand class function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+}
+//**********************************************************************************************************************
+
+void ScreenSeqsCommand::help(){
+       try {
+               m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
+               m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, and group.\n");
+               m->mothurOut("The fasta parameter is required.\n");
+               m->mothurOut("The start parameter .... The default is -1.\n");
+               m->mothurOut("The end parameter .... The default is -1.\n");
+               m->mothurOut("The maxambig parameter .... The default is -1.\n");
+               m->mothurOut("The maxhomop parameter .... The default is -1.\n");
+               m->mothurOut("The minlength parameter .... The default is -1.\n");
+               m->mothurOut("The maxlength parameter .... The default is -1.\n");
+               m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
+               m->mothurOut("The screen.seqs command should be in the following format: \n");
+               m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig,  \n");
+               m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength)  \n");    
+               m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
+               m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ScreenSeqsCommand", "help");
                exit(1);
-       }       
+       }
 }
 
 //***************************************************************************************************************
@@ -35,88 +160,107 @@ ScreenSeqsCommand::~ScreenSeqsCommand(){  /*      do nothing      */      }
 
 int ScreenSeqsCommand::execute(){
        try{
-               int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength;
-               convert(globaldata->getStartPos(), startPos);
-               convert(globaldata->getEndPos(), endPos);
-               convert(globaldata->getMaxAmbig(), maxAmbig);
-               convert(globaldata->getMaxHomoPolymer(), maxHomoP);
-               convert(globaldata->getMinLength(), minLength);
-               convert(globaldata->getMaxLength(), maxLength);
                
+               if (abort == true) { return 0; }
+                               
                ifstream inFASTA;
-               openInputFile(globaldata->getFastaFile(), inFASTA);
+               openInputFile(fastafile, inFASTA);
                
                set<string> badSeqNames;
                
-               string goodSeqFile = getRootName(globaldata->getFastaFile()) + "good" + getExtension(globaldata->getFastaFile());
-               string badSeqFile = getRootName(globaldata->getFastaFile()) + "bad" + getExtension(globaldata->getFastaFile());
+               string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
+               string badSeqFile =  outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
                
                ofstream goodSeqOut;    openOutputFile(goodSeqFile, goodSeqOut);
                ofstream badSeqOut;             openOutputFile(badSeqFile, badSeqOut);          
                
                while(!inFASTA.eof()){
-                       Sequence currSeq(inFASTA);
-                       bool goodSeq = 1;               //      innocent until proven guilty
-                       if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
-                       if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
-                       if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
-                       if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
+                       if (m->control_pressed) { goodSeqOut.close(); badSeqOut.close(); inFASTA.close(); remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
                        
-                       if(goodSeq == 1){
-                               currSeq.printSequence(goodSeqOut);      
-                       }
-                       else{
-                               currSeq.printSequence(badSeqOut);       
-                               badSeqNames.insert(currSeq.getName());
+                       Sequence currSeq(inFASTA);
+                       if (currSeq.getName() != "") {
+                               bool goodSeq = 1;               //      innocent until proven guilty
+                               if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos())                  {       goodSeq = 0;    }
+                               if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos())                                {       goodSeq = 0;    }
+                               if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases())                {       goodSeq = 0;    }
+                               if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer())   {       goodSeq = 0;    }
+                               if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases())                {       goodSeq = 0;    }
+                               if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases())                {       goodSeq = 0;    }
+                               
+                               if(goodSeq == 1){
+                                       currSeq.printSequence(goodSeqOut);      
+                               }
+                               else{
+                                       currSeq.printSequence(badSeqOut);       
+                                       badSeqNames.insert(currSeq.getName());
+                               }
                        }
                        gobble(inFASTA);
-               }       
-               if(globaldata->getNameFile() != ""){
-                       screenNameGroupFile(badSeqNames);
-               }
-               else if(globaldata->getGroupFile() != ""){
-                       screenGroupFile(badSeqNames);
                }
+                       
+               if(namefile != "" && groupfile != "")   {       
+                       screenNameGroupFile(badSeqNames);       
+                       if (m->control_pressed) { goodSeqOut.close(); badSeqOut.close(); inFASTA.close(); remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
+               }else if(namefile != "")        {       
+                       screenNameGroupFile(badSeqNames);
+                       if (m->control_pressed) { goodSeqOut.close(); badSeqOut.close(); inFASTA.close(); remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }  
+               }else if(groupfile != "")                               {       screenGroupFile(badSeqNames);           }       // this screens just the group
+               
+               if (m->control_pressed) { goodSeqOut.close(); badSeqOut.close(); inFASTA.close(); remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
+
+               if(alignreport != "")                                   {       screenAlignReport(badSeqNames);         }
+               
+               goodSeqOut.close();
+               badSeqOut.close();
+               inFASTA.close();
+               
+               
+               if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; }
+
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               m->mothurOut(goodSeqFile); m->mothurOutEndLine();       
+               m->mothurOut(badSeqFile); m->mothurOutEndLine();        
+               for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+               m->mothurOutEndLine();
+
                
                return 0;
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }
-       catch(...) {
-               cout << "An unknown error has occurred in the ScreenSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               m->errorOut(e, "ScreenSeqsCommand", "execute");
                exit(1);
        }
-       
 }
 
 //***************************************************************************************************************
 
-void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
+int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
 
        ifstream inputNames;
-       openInputFile(globaldata->getNameFile(), inputNames);
+       openInputFile(namefile, inputNames);
        set<string> badSeqGroups;
        string seqName, seqList, group;
        set<string>::iterator it;
 
-       string goodNameFile = getRootName(globaldata->getNameFile()) + "good" + getExtension(globaldata->getNameFile());
-       string badNameFile = getRootName(globaldata->getNameFile()) + "bad" + getExtension(globaldata->getNameFile());
+       string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
+       string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
+       
+       outputNames.push_back(goodNameFile);  outputNames.push_back(badNameFile);
        
        ofstream goodNameOut;   openOutputFile(goodNameFile, goodNameOut);
        ofstream badNameOut;    openOutputFile(badNameFile, badNameOut);                
        
        while(!inputNames.eof()){
+               if (m->control_pressed) { goodNameOut.close(); badNameOut.close(); inputNames.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); return 0; }
+
                inputNames >> seqName >> seqList;
                it = badSeqNames.find(seqName);
                
                if(it != badSeqNames.end()){
                        badSeqNames.erase(it);
                        badNameOut << seqName << '\t' << seqList << endl;
-                       if(globaldata->getNameFile() != ""){
+                       if(namefile != ""){
                                int start = 0;
                                for(int i=0;i<seqList.length();i++){
                                        if(seqList[i] == ','){
@@ -136,18 +280,30 @@ void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
        goodNameOut.close();
        badNameOut.close();
        
-       if(globaldata->getGroupFile() != ""){
+       //we were unable to remove some of the bad sequences
+       if (badSeqNames.size() != 0) {
+               for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
+                       m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
+                       m->mothurOutEndLine();
+               }
+       }
+
+       if(groupfile != ""){
                
                ifstream inputGroups;
-               openInputFile(globaldata->getGroupFile(), inputGroups);
+               openInputFile(groupfile, inputGroups);
 
-               string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
-               string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+               string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
+               string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
+               
+               outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
                
                ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
                ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
                
                while(!inputGroups.eof()){
+                       if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str()); remove(badNameFile.c_str()); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
+
                        inputGroups >> seqName >> group;
 
                        it = badSeqGroups.find(seqName);
@@ -164,25 +320,40 @@ void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                inputGroups.close();
                goodGroupOut.close();
                badGroupOut.close();
+               
+               //we were unable to remove some of the bad sequences
+               if (badSeqGroups.size() != 0) {
+                       for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {  
+                               m->mothurOut("Your namefile does not include the sequence " + *it + " please correct."); 
+                               m->mothurOutEndLine();
+                       }
+               }
        }
+               
+       return 0;
+
 }
 
 //***************************************************************************************************************
 
-void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
+int ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
 
        ifstream inputGroups;
-       openInputFile(globaldata->getGroupFile(), inputGroups);
+       openInputFile(groupfile, inputGroups);
        string seqName, group;
        set<string>::iterator it;
        
-       string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
-       string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
+       string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
+       string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
+       
+       outputNames.push_back(goodGroupFile);  outputNames.push_back(badGroupFile);
        
        ofstream goodGroupOut;  openOutputFile(goodGroupFile, goodGroupOut);
        ofstream badGroupOut;   openOutputFile(badGroupFile, badGroupOut);              
        
        while(!inputGroups.eof()){
+               if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
+
                inputGroups >> seqName >> group;
                it = badSeqNames.find(seqName);
                
@@ -195,10 +366,93 @@ void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
                }
                gobble(inputGroups);
        }
+       
+       if (m->control_pressed) { goodGroupOut.close(); badGroupOut.close(); inputGroups.close(); remove(goodGroupFile.c_str()); remove(badGroupFile.c_str()); return 0; }
+
+       //we were unable to remove some of the bad sequences
+       if (badSeqNames.size() != 0) {
+               for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
+                       m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); 
+                       m->mothurOutEndLine();
+               }
+       }
+       
        inputGroups.close();
        goodGroupOut.close();
        badGroupOut.close();
        
+       if (m->control_pressed) { remove(goodGroupFile.c_str()); remove(badGroupFile.c_str());  }
+
+       
+       return 0;
+       
+}
+
+//***************************************************************************************************************
+
+int ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
+       
+       ifstream inputAlignReport;
+       openInputFile(alignreport, inputAlignReport);
+       string seqName, group;
+       set<string>::iterator it;
+       
+       string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
+       string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
+       
+       outputNames.push_back(goodAlignReportFile);  outputNames.push_back(badAlignReportFile);
+       
+       ofstream goodAlignReportOut;    openOutputFile(goodAlignReportFile, goodAlignReportOut);
+       ofstream badAlignReportOut;             openOutputFile(badAlignReportFile, badAlignReportOut);          
+
+       while (!inputAlignReport.eof()) {               //      need to copy header
+               char c = inputAlignReport.get();
+               goodAlignReportOut << c;
+               badAlignReportOut << c;
+               if (c == 10 || c == 13){        break;  }       
+       }
+
+       while(!inputAlignReport.eof()){
+               if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
+
+               inputAlignReport >> seqName;
+               it = badSeqNames.find(seqName);
+               string line;            
+               while (!inputAlignReport.eof()) {               //      need to copy header
+                       char c = inputAlignReport.get();
+                       line += c;
+                       if (c == 10 || c == 13){        break;  }       
+               }
+               
+               if(it != badSeqNames.end()){
+                       badSeqNames.erase(it);
+                       badAlignReportOut << seqName << '\t' << line;;
+               }
+               else{
+                       goodAlignReportOut << seqName << '\t' << line;
+               }
+               gobble(inputAlignReport);
+       }
+       
+       if (m->control_pressed) { goodAlignReportOut.close(); badAlignReportOut.close(); inputAlignReport.close(); remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
+
+       //we were unable to remove some of the bad sequences
+       if (badSeqNames.size() != 0) {
+               for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {  
+                       m->mothurOut("Your file does not include the sequence " + *it + " please correct."); 
+                       m->mothurOutEndLine();
+               }
+       }
+
+       inputAlignReport.close();
+       goodAlignReportOut.close();
+       badAlignReportOut.close();
+                       
+       if (m->control_pressed) {  remove(goodAlignReportFile.c_str()); remove(badAlignReportFile.c_str()); return 0; }
+       
+       return 0;
+
+       
 }
 
 //***************************************************************************************************************