]> git.donarmstrong.com Git - mothur.git/blobdiff - screenseqscommand.cpp
added summary outputs for align.check
[mothur.git] / screenseqscommand.cpp
index 98b6681d5091ffbb30b80172c0a36d12a531d130..e6d29c190433d03ead4562f963c5416e7a1f65b8 100644 (file)
@@ -26,8 +26,7 @@ vector<string> ScreenSeqsCommand::getValidParameters(){
 //**********************************************************************************************************************
 ScreenSeqsCommand::ScreenSeqsCommand(){        
        try {
-               abort = true;
-               //initialize outputTypes
+               abort = true; calledHelp = true; 
                vector<string> tempOutNames;
                outputTypes["fasta"] = tempOutNames;
                outputTypes["name"] = tempOutNames;
@@ -67,10 +66,10 @@ vector<string> ScreenSeqsCommand::getRequiredFiles(){
 
 ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
        try {
-               abort = false;
+               abort = false; calledHelp = false;   
                
                //allow user to run help
-               if(option == "help") { help(); abort = true; }
+               if(option == "help") { help(); abort = true; calledHelp = true; }
                
                else {
                        //valid paramters for this command
@@ -145,7 +144,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                        else if (groupfile == "not found") { groupfile = ""; }
                        
                        namefile = validParameter.validFile(parameters, "name", true);
-                       if (namefile == "not open") { abort = true; }
+                       if (namefile == "not open") { namefile = ""; abort = true; }
                        else if (namefile == "not found") { namefile = ""; }    
 
                        alignreport = validParameter.validFile(parameters, "alignreport", true);
@@ -247,11 +246,15 @@ ScreenSeqsCommand::~ScreenSeqsCommand(){  /*      do nothing      */      }
 int ScreenSeqsCommand::execute(){
        try{
                
-               if (abort == true) { return 0; }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                
-               //if the user want to optimize we need to no the 90% mark
+               //if the user want to optimize we need to know the 90% mark
                vector<unsigned long int> positions;
-               if (optimize.size() != 0) {  getSummary(positions); } //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+               if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+                       //use the namefile to optimize correctly
+                       if (namefile != "") { nameMap = m->readNames(namefile); }
+                       getSummary(positions); 
+               } 
                else { 
                        positions = m->divideFile(fastafile, processors);
                        for (int i = 0; i < (positions.size()-1); i++) {
@@ -267,7 +270,7 @@ int ScreenSeqsCommand::execute(){
                int start = time(NULL);
                
 #ifdef USE_MPI 
-                       int pid, end, numSeqsPerProcessor; 
+                       int pid, numSeqsPerProcessor; 
                        int tag = 2001;
                        vector<unsigned long int> MPIPos;
                        
@@ -507,13 +510,13 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
                
                ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
-       
+               
                while(!inputNames.eof()){
                        if (m->control_pressed) { goodNameOut.close();  inputNames.close(); remove(goodNameFile.c_str());  return 0; }
 
                        inputNames >> seqName >> seqList;
                        it = badSeqNames.find(seqName);
-                       
+                               
                        if(it != badSeqNames.end()){
                                badSeqNames.erase(it);
                                
@@ -558,7 +561,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                                if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str());  remove(goodGroupFile.c_str()); return 0; }
 
                                inputGroups >> seqName >> group;
-
+                               
                                it = badSeqGroups.find(seqName);
                                
                                if(it != badSeqGroups.end()){
@@ -580,7 +583,8 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                                }
                        }
                }
-                       
+               
+               
                return 0;
        
        }
@@ -625,14 +629,15 @@ int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
                sort(ambigBases.begin(), ambigBases.end());
                sort(longHomoPolymer.begin(), longHomoPolymer.end());
                
-               int criteriaPercentile  = int(numSeqs * (criteria / (float) 100));
+               //numSeqs is the number of unique seqs, startPosition.size() is the total number of seqs, we want to optimize using all seqs
+               int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
                
                for (int i = 0; i < optimize.size(); i++) {
                        if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
-                       else if (optimize[i] == "end") { int endcriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
+                       else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
                        else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
                        else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
-                       else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
+                       else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
                        else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
                }
 
@@ -662,11 +667,24 @@ int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                        Sequence current(in); m->gobble(in);
        
                        if (current.getName() != "") {
-                               startPosition.push_back(current.getStartPos());
-                               endPosition.push_back(current.getEndPos());
-                               seqLength.push_back(current.getNumBases());
-                               ambigBases.push_back(current.getAmbigBases());
-                               longHomoPolymer.push_back(current.getLongHomoPolymer());
+                               int num = 1;
+                               if (namefile != "") {
+                                       //make sure this sequence is in the namefile, else error 
+                                       map<string, int>::iterator it = nameMap.find(current.getName());
+                                       
+                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                                       else { num = it->second; }
+                               }
+                               
+                               //for each sequence this sequence represents
+                               for (int i = 0; i < num; i++) {
+                                       startPosition.push_back(current.getStartPos());
+                                       endPosition.push_back(current.getEndPos());
+                                       seqLength.push_back(current.getNumBases());
+                                       ambigBases.push_back(current.getAmbigBases());
+                                       longHomoPolymer.push_back(current.getLongHomoPolymer());
+                               }
+                               
                                count++;
                        }
                        
@@ -712,6 +730,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition,
                                m->openOutputFile(tempFile, out);
                                
                                out << num << endl;
+                               out << startPosition.size() << endl;
                                for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
                                for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
                                for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
@@ -744,6 +763,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition,
                        
                        int temp, tempNum;
                        in >> tempNum; m->gobble(in); num += tempNum;
+                       in >> tempNum; m->gobble(in);
                        for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
                        for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
                        for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);