added summary outputs for align.check

[mothur.git] / screenseqscommand.cpp
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp

index 8a7a108eb322978377a9d09ffb880a2745c54caf..e6d29c190433d03ead4562f963c5416e7a1f65b8 100644 (file)
--- a/screenseqscommand.cpp
+++ b/screenseqscommand.cpp
@@ -26,8 +26,7 @@ vector<string> ScreenSeqsCommand::getValidParameters(){
  //**********************************************************************************************************************
  ScreenSeqsCommand::ScreenSeqsCommand(){        
         try {
-               abort = true;
-               //initialize outputTypes
+               abort = true; calledHelp = true; 
                 vector<string> tempOutNames;
                 outputTypes["fasta"] = tempOutNames;
                 outputTypes["name"] = tempOutNames;
@@ -67,10 +66,10 @@ vector<string> ScreenSeqsCommand::getRequiredFiles(){
  
  ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
         try {
-               abort = false;
+               abort = false; calledHelp = false;   
                 
                 //allow user to run help
-               if(option == "help") { help(); abort = true; }
+               if(option == "help") { help(); abort = true; calledHelp = true; }
                 
                 else {
                         //valid paramters for this command
@@ -145,7 +144,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option)  {
                         else if (groupfile == "not found") { groupfile = ""; }
                         
                         namefile = validParameter.validFile(parameters, "name", true);
-                       if (namefile == "not open") { abort = true; }
+                       if (namefile == "not open") { namefile = ""; abort = true; }
                         else if (namefile == "not found") { namefile = ""; }    
  
                         alignreport = validParameter.validFile(parameters, "alignreport", true);
@@ -247,11 +246,15 @@ ScreenSeqsCommand::~ScreenSeqsCommand(){  /*      do nothing      */      }
  int ScreenSeqsCommand::execute(){
         try{
                 
-               if (abort == true) { return 0; }
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-               //if the user want to optimize we need to no the 90% mark
+               //if the user wants to optimize we need to know the 90% mark
                 vector<unsigned long int> positions;
-               if (optimize.size() != 0) {  getSummary(positions); } //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+               if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
+                       //use the namefile to optimize correctly
+                       if (namefile != "") { nameMap = m->readNames(namefile); }
+                       getSummary(positions); 
+               } 
                 else { 
                         positions = m->divideFile(fastafile, processors);
                         for (int i = 0; i < (positions.size()-1); i++) {
@@ -267,7 +270,7 @@ int ScreenSeqsCommand::execute(){
                 int start = time(NULL);
                 
  #ifdef USE_MPI 
-                       int pid, end, numSeqsPerProcessor; 
+                       int pid, numSeqsPerProcessor; 
                         int tag = 2001;
                         vector<unsigned long int> MPIPos;
                         
@@ -507,13 +510,13 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                 outputNames.push_back(goodNameFile);  outputTypes["name"].push_back(goodNameFile);
                 
                 ofstream goodNameOut;   m->openOutputFile(goodNameFile, goodNameOut);
-       
+               
                 while(!inputNames.eof()){
                         if (m->control_pressed) { goodNameOut.close();  inputNames.close(); remove(goodNameFile.c_str());  return 0; }
  
                         inputNames >> seqName >> seqList;
                         it = badSeqNames.find(seqName);
-                       
+                               
                         if(it != badSeqNames.end()){
                                 badSeqNames.erase(it);
                                 
@@ -558,7 +561,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                                 if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); remove(goodNameFile.c_str());  remove(goodGroupFile.c_str()); return 0; }
  
                                 inputGroups >> seqName >> group;
-
+                               
                                 it = badSeqGroups.find(seqName);
                                 
                                 if(it != badSeqGroups.end()){
@@ -580,7 +583,8 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
                                 }
                         }
                 }
-                       
+               
+               
                 return 0;
         
         }
@@ -625,14 +629,15 @@ int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
                 sort(ambigBases.begin(), ambigBases.end());
                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
                 
-               int criteriaPercentile  = int(numSeqs * (criteria / (float) 100));
+               //numSeqs is the number of unique seqs; startPosition.size() is the total number of seqs. We want to optimize using all seqs, so use the latter.
+               int criteriaPercentile  = int(startPosition.size() * (criteria / (float) 100));
                 
                 for (int i = 0; i < optimize.size(); i++) {
                         if (optimize[i] == "start") { startPos = startPosition[criteriaPercentile]; m->mothurOut("Optimizing start to " + toString(startPos) + "."); m->mothurOutEndLine(); }
-                       else if (optimize[i] == "end") { int endcriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
+                       else if (optimize[i] == "end") { int endcriteriaPercentile = int(endPosition.size() * ((100 - criteria) / (float) 100));  endPos = endPosition[endcriteriaPercentile]; m->mothurOut("Optimizing end to " + toString(endPos) + "."); m->mothurOutEndLine();}
                         else if (optimize[i] == "maxambig") { maxAmbig = ambigBases[criteriaPercentile]; m->mothurOut("Optimizing maxambig to " + toString(maxAmbig) + "."); m->mothurOutEndLine(); }
                         else if (optimize[i] == "maxhomop") { maxHomoP = longHomoPolymer[criteriaPercentile]; m->mothurOut("Optimizing maxhomop to " + toString(maxHomoP) + "."); m->mothurOutEndLine(); }
-                       else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(numSeqs * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
+                       else if (optimize[i] == "minlength") { int mincriteriaPercentile = int(seqLength.size() * ((100 - criteria) / (float) 100)); minLength = seqLength[mincriteriaPercentile]; m->mothurOut("Optimizing minlength to " + toString(minLength) + "."); m->mothurOutEndLine(); }
                         else if (optimize[i] == "maxlength") { maxLength = seqLength[criteriaPercentile]; m->mothurOut("Optimizing maxlength to " + toString(maxLength) + "."); m->mothurOutEndLine(); }
                 }
  
@@ -662,11 +667,24 @@ int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                         Sequence current(in); m->gobble(in);
         
                         if (current.getName() != "") {
-                               startPosition.push_back(current.getStartPos());
-                               endPosition.push_back(current.getEndPos());
-                               seqLength.push_back(current.getNumBases());
-                               ambigBases.push_back(current.getAmbigBases());
-                               longHomoPolymer.push_back(current.getLongHomoPolymer());
+                               int num = 1;
+                               if (namefile != "") {
+                                       //make sure this sequence is in the namefile, else error 
+                                       map<string, int>::iterator it = nameMap.find(current.getName());
+                                       
+                                       if (it == nameMap.end()) { m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                                       else { num = it->second; }
+                               }
+                               
+                               //for each sequence this sequence represents
+                               for (int i = 0; i < num; i++) {
+                                       startPosition.push_back(current.getStartPos());
+                                       endPosition.push_back(current.getEndPos());
+                                       seqLength.push_back(current.getNumBases());
+                                       ambigBases.push_back(current.getAmbigBases());
+                                       longHomoPolymer.push_back(current.getLongHomoPolymer());
+                               }
+                               
                                 count++;
                         }
                         
@@ -712,6 +730,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition,
                                 m->openOutputFile(tempFile, out);
                                 
                                 out << num << endl;
+                               out << startPosition.size() << endl;
                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
@@ -721,7 +740,11 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition,
                                 out.close();
                                 
                                 exit(0);
-                       }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
                 }
                 
                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
@@ -740,6 +763,7 @@ int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition,
                         
                         int temp, tempNum;
                         in >> tempNum; m->gobble(in); num += tempNum;
+                       in >> tempNum; m->gobble(in);
                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
@@ -1007,6 +1031,9 @@ int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File&
                                         delete buf3;
                                 }
                         }
+                       
+                       //report progress
+                       if((i) % 100 == 0){     m->mothurOut("Processing sequence: " + toString(i)); m->mothurOutEndLine();             }
                 }
                                 
                 return 1;
@@ -1043,7 +1070,11 @@ int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, st
                                 out.close();
                                 
                                 exit(0);
-                       }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
                 }
                 
                 //force parent to wait until all the processes are done