]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.cpp
worked on trim.seqs - added in the groupfiles for allfiles=t, cleaned up the outputNa...
[mothur.git] / trimseqscommand.cpp
index 2ad4f0607a24c23506109745553d608544a43d6c..abfe15a17a18d9e203be7bc4bfaac0fd0e6b4a0f 100644 (file)
@@ -14,7 +14,7 @@
 
 vector<string> TrimSeqsCommand::getValidParameters(){  
        try {
-               string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
+               string Array[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop","minlength", "maxlength", "qfile", 
                                                                        "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
                                                                        "keepfirst", "removelast",
                                                                        "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
@@ -34,7 +34,7 @@ TrimSeqsCommand::TrimSeqsCommand(){
                abort = true; calledHelp = true; 
                vector<string> tempOutNames;
                outputTypes["fasta"] = tempOutNames;
-               outputTypes["qual"] = tempOutNames;
+               outputTypes["qfile"] = tempOutNames;
                outputTypes["group"] = tempOutNames;
        }
        catch(exception& e) {
@@ -83,7 +83,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                
                else {
                        //valid paramters for this command
-                       string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", 
+                       string AlignArray[] =  {        "fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
                                                                "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage",
                                                                "keepfirst", "removelast",
                                                                "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"};
@@ -104,7 +104,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                        //initialize outputTypes
                        vector<string> tempOutNames;
                        outputTypes["fasta"] = tempOutNames;
-                       outputTypes["qual"] = tempOutNames;
+                       outputTypes["qfile"] = tempOutNames;
                        outputTypes["group"] = tempOutNames;
                        
                        //if the user changes the input directory command factory will send this info to us in the output parameter 
@@ -136,13 +136,6 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                                        if (path == "") {       parameters["qfile"] = inputDir + it->second;            }
                                }
                                
-                               it = parameters.find("group");
-                               //user has given a template file
-                               if(it != parameters.end()){ 
-                                       path = m->hasPath(it->second);
-                                       //if the user has not given a path then, add inputdir. else leave path alone.
-                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
-                               }
                        }
 
                        
@@ -170,10 +163,6 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                        else if(temp == "not open"){    abort = true;   } 
                        else                                    {       oligoFile = temp;               }
                        
-                       temp = validParameter.validFile(parameters, "group", true);
-                       if (temp == "not found"){       groupfile = "";         }
-                       else if(temp == "not open"){    abort = true;   } 
-                       else                                    {       groupfile = temp;               }
                        
                        temp = validParameter.validFile(parameters, "maxambig", false);         if (temp == "not found") { temp = "-1"; }
                        convert(temp, maxAmbig);  
@@ -236,13 +225,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                        temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found") { temp = "1"; }
                        convert(temp, processors); 
                        
-                       if ((oligoFile != "") && (groupfile != "")) {
-                               m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true;
-                       }
-                                                                                               
                        
-                       if(allFiles && (oligoFile == "") && (groupfile == "")){
-                               m->mothurOut("You selected allfiles, but didn't enter an oligos or group file.  Ignoring the allfiles request."); m->mothurOutEndLine();
+                       if(allFiles && (oligoFile == "")){
+                               m->mothurOut("You selected allfiles, but didn't enter an oligos.  Ignoring the allfiles request."); m->mothurOutEndLine();
                        }
                        if((qAverage != 0 && qThreshold != 0) && qFileName == ""){
                                m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine();
@@ -268,9 +253,8 @@ void TrimSeqsCommand::help(){
        try {
                m->mothurOut("The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n");
                m->mothurOut("The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n");
-               m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
+               m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n");
                m->mothurOut("The fasta parameter is required.\n");
-               m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n");
                m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
                m->mothurOut("The oligos parameter allows you to provide an oligos file.\n");
                m->mothurOut("The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n");
@@ -333,15 +317,14 @@ int TrimSeqsCommand::execute(){
                if (qFileName != "") {
                        outputNames.push_back(trimQualFile);
                        outputNames.push_back(scrapQualFile);
-                       outputTypes["qual"].push_back(trimQualFile);
-                       outputTypes["qual"].push_back(scrapQualFile); 
+                       outputTypes["qfile"].push_back(trimQualFile);
+                       outputTypes["qfile"].push_back(scrapQualFile); 
                }
                
                string outputGroupFileName;
-
                if(oligoFile != ""){
                        outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups";
-                       outputNames.push_back(outputGroupFileName); outputTypes["groups"].push_back(outputGroupFileName);
+                       outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
                        getOligos(fastaFileNames, qualFileNames);
                }
 
@@ -363,33 +346,73 @@ int TrimSeqsCommand::execute(){
                                        createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); 
                                }       
                #else
-                               driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
+                               driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]);
                #endif
                
                if (m->control_pressed) {  return 0; }                  
                        
-                               
                if(allFiles){
+                       //clear out all old group files
+                       map<string, string> uniqueFastaNames;// so we don't add the same groupfile multiple times
+                       map<string, string>::iterator it;
+                       set<string> namesToRemove;
                        for(int i=0;i<fastaFileNames.size();i++){
                                for(int j=0;j<fastaFileNames[0].size();j++){
-                                       if(m->isBlank(fastaFileNames[i][j])){
-                                               remove(fastaFileNames[i][j].c_str());
-
-                                               if(qFileName != ""){
+                                       if (fastaFileNames[i][j] != "") {
+                                               if(m->isBlank(fastaFileNames[i][j])){
                                                        remove(fastaFileNames[i][j].c_str());
+                                                       namesToRemove.insert(fastaFileNames[i][j]);
+                                                       
+                                                       if(qFileName != ""){
+                                                               remove(qualFileNames[i][j].c_str());
+                                                               namesToRemove.insert(qualFileNames[i][j]);
+                                                       }
+                                               }else{  
+                                                       it = uniqueFastaNames.find(fastaFileNames[i][j]);
+                                                       if (it == uniqueFastaNames.end()) {     
+                                                               uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i];  
+                                                       }       
                                                }
-
                                        }
                                }
                        }
+                       
+                       //remove names for outputFileNames, just cleans up the output
+                       vector<string> outputNames2;
+                       for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } }
+                       outputNames = outputNames2;
+                       
+                       for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) {
+                               ifstream in;
+                               m->openInputFile(it->first, in);
+                               
+                               ofstream out;
+                               string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups";
+                               outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName);
+                               m->openOutputFile(thisGroupName, out);
+                               
+                               while (!in.eof()){
+                                       if (m->control_pressed) { break; }
+                                       
+                                       Sequence currSeq(in); m->gobble(in);
+                                       out << currSeq.getName() << '\t' << it->second << endl;
+                               }
+                               in.close();
+                               out.close();
+                       }
                }
                
-               
-               
-               if (m->control_pressed) { 
-                       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); }
-                       return 0;
+               if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
+
+               //output group counts
+               m->mothurOutEndLine();
+               int total = 0;
+               for (int i = 0; i < barcodeNameVector.size(); i++) {
+                       if ((barcodeNameVector[i] != "") && (groupCounts[i] != 0)) { total += groupCounts[i]; m->mothurOut("Group " + barcodeNameVector[i] + " contains " + toString(groupCounts[i]) + " sequences."); m->mothurOutEndLine(); }
                }
+               if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); }
+               
+                       if (m->control_pressed) {       for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str()); } return 0;     }
 
                m->mothurOutEndLine();
                m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -426,14 +449,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                ofstream outGroupsFile;
                if (oligoFile != ""){   m->openOutputFile(groupFileName, outGroupsFile);   }
-               
                if(allFiles){
                        for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file
                                for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file
-                                       ofstream temp;
-                                       m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
-                                       if(qFileName != ""){
-                                               m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
+                                       if (fastaFileNames[i][j] != "") {
+                                               ofstream temp;
+                                               m->openOutputFile(fastaFileNames[i][j], temp);                  temp.close();
+                                               if(qFileName != ""){
+                                                       m->openOutputFile(qualFileNames[i][j], temp);                   temp.close();
+                                               }
                                        }
                                }
                        }
@@ -474,7 +498,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
 
                        QualityScores currQual;
                        if(qFileName != ""){
-                               currQual = QualityScores(qFile, currSeq.getNumBases());  m->gobble(qFile);
+                               currQual = QualityScores(qFile);  m->gobble(qFile);
                        }
 
                        string origSeq = currSeq.getUnaligned();
@@ -557,6 +581,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        
                                        if(barcodes.size() != 0){
                                                outGroupsFile << currSeq.getName() << '\t' << barcodeNameVector[barcodeIndex] << endl;
+                                               groupCounts[barcodeIndex]++;
                                        }
                                        
                                        
@@ -640,12 +665,14 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
 
                                        for(int i=0;i<tempFASTAFileNames.size();i++){
                                                for(int j=0;j<tempFASTAFileNames[i].size();j++){
-                                                       tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
-                                                       m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
-
-                                                       if(qFileName != ""){
-                                                               tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
-                                                               m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
+                                                       if (tempFASTAFileNames[i][j] != "") {
+                                                               tempFASTAFileNames[i][j] += toString(getpid()) + ".temp";
+                                                               m->openOutputFile(tempFASTAFileNames[i][j], temp);                      temp.close();
+
+                                                               if(qFileName != ""){
+                                                                       tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp";
+                                                                       m->openOutputFile(tempPrimerQualFileNames[i][j], temp);         temp.close();
+                                                               }
                                                        }
                                                }
                                        }
@@ -663,6 +690,15 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                                                 lines[process],
                                                                 qLines[process]);
                                
+                               //pass groupCounts to parent
+                               ofstream out;
+                               string tempFile = filename + toString(getpid()) + ".num.temp";
+                               m->openOutputFile(tempFile, out);
+                               for(int i = 0; i < groupCounts.size(); i++) {
+                                       out << groupCounts[i] << endl;
+                               }
+                               out.close();
+                               
                                exit(0);
                        }else { 
                                m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
@@ -678,11 +714,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                m->openOutputFile(trimQualFileName, temp);              temp.close();
                m->openOutputFile(scrapQualFileName, temp);             temp.close();
 
-               
-               
                driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
                
-               
                //force parent to wait until all the processes are done
                for (int i=0;i<processIDS.size();i++) { 
                        int temp = processIDS[i];
@@ -713,17 +746,31 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                        if(allFiles){
                                for(int j=0;j<fastaFileNames.size();j++){
                                        for(int k=0;k<fastaFileNames[j].size();k++){
-                                               m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
-                                               remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
-                                               
-                                               if(qFileName != ""){
-                                                       m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
-                                                       remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
+                                               if (fastaFileNames[j][k] != "") {
+                                                       m->appendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]);
+                                                       remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
+                                                       
+                                                       if(qFileName != ""){
+                                                               m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]);
+                                                               remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str());
+                                                       }
                                                }
                                        }
                                }
                        }
                        
+                       ifstream in;
+                       string tempFile =  filename + toString(processIDS[i]) + ".num.temp";
+                       m->openInputFile(tempFile, in);
+                       int count = 0; 
+                       int tempNum;
+                       while (!in.eof()) { 
+                               in >> tempNum; m->gobble(in);
+                               groupCounts[count] += tempNum; 
+                               count++;
+                       }
+                       in.close(); remove(tempFile.c_str());
+                       
                }
        
                return exitCommand;
@@ -910,6 +957,7 @@ void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                if(qFileName != ""){    qualFileNames = fastaFileNames; }
                
                if(allFiles){
+                       set<string> uniqueNames; //used to cleanup outputFileNames
                        for(map<string, int>::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){
                                for(map<string, int>::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){
                                        
@@ -934,15 +982,22 @@ void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
 
                                        ofstream temp;
                                        fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta";
-                                       outputNames.push_back(fastaFileName);
-                                       outputTypes["fasta"].push_back(fastaFileName);
+                                       if (uniqueNames.count(fastaFileName) == 0) {
+                                               outputNames.push_back(fastaFileName);
+                                               outputTypes["fasta"].push_back(fastaFileName);
+                                               uniqueNames.insert(fastaFileName);
+                                       }
+                                       
                                        fastaFileNames[itBar->second][itPrimer->second] = fastaFileName;
                                        m->openOutputFile(fastaFileName, temp);         temp.close();
 
                                        if(qFileName != ""){
                                                qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual";
-                                               outputNames.push_back(qualFileName);
-                                               outputTypes["qual"].push_back(qualFileName);
+                                               if (uniqueNames.count(fastaFileName) == 0) {
+                                                       outputNames.push_back(qualFileName);
+                                                       outputTypes["qfile"].push_back(qualFileName);
+                                               }
+                                               
                                                qualFileNames[itBar->second][itPrimer->second] = qualFileName;
                                                m->openOutputFile(qualFileName, temp);          temp.close();
                                        }
@@ -951,6 +1006,7 @@ void TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                }
                numFPrimers = primers.size();
                numRPrimers = revPrimer.size();
+               groupCounts.resize(barcodeNameVector.size(), 0);
 
        }
        catch(exception& e) {