]> git.donarmstrong.com Git - mothur.git/commitdiff
changed groupfile in classify.seqs to reflect multiple fasta files
authorwestcott <westcott>
Fri, 7 May 2010 12:39:04 +0000 (12:39 +0000)
committerwestcott <westcott>
Fri, 7 May 2010 12:39:04 +0000 (12:39 +0000)
classifyseqscommand.cpp
classifyseqscommand.h
engine.cpp
filterseqscommand.cpp
getseqscommand.cpp
hcluster.cpp
phylosummary.cpp
trimseqscommand.cpp
trimseqscommand.h

index 0538bff07d99ca1266d488a5844fcde49afaf3bc..bdf059c6a063aadfa2256cfe2368d12c7b69bc0e 100644 (file)
@@ -82,9 +82,46 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                        }
                        else if (templateFileName == "not open") { abort = true; }      
                        
-                       groupfile = validParameter.validFile(parameters, "group", true);
-                       if (groupfile == "not open") { abort = true; }  
-                       else if (groupfile == "not found") { groupfile = ""; }
+                       groupfile = validParameter.validFile(parameters, "group", false);
+                       if (groupfile == "not found") { groupfile = "";  }
+                       else { 
+                               splitAtDash(groupfile, groupfileNames);
+                               
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < groupfileNames.size(); i++) {
+                                       if (inputDir != "") {
+                                               string path = hasPath(groupfileNames[i]);
+                                               //if the user has not given a path then, add inputdir. else leave path alone.
+                                               if (path == "") {       groupfileNames[i] = inputDir + groupfileNames[i];               }
+                                       }
+                                       int ableToOpen;
+                                       
+                                       #ifdef USE_MPI  
+                                               int pid;
+                                               MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
+                                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                               
+                                               if (pid == 0) {
+                                       #endif
+
+                                       ifstream in;
+                                       ableToOpen = openInputFile(groupfileNames[i], in);
+                                       in.close();
+                                       
+                                       #ifdef USE_MPI  
+                                                       for (int j = 1; j < processors; j++) {
+                                                               MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
+                                                       }
+                                               }else{
+                                                       MPI_Status status;
+                                                       MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+                                               }
+                                               
+                                       #endif
+                                       if (ableToOpen == 1) {  m->mothurOut("Unable to match group file with fasta file."); m->mothurOutEndLine(); abort = true;       }
+                                       
+                               }
+                       }
                        
                        fastaFileName = validParameter.validFile(parameters, "fasta", false);
                        if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true;  }
@@ -193,6 +230,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                                if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
                        }
                        
+                       if (groupfile != "") {
+                               if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
+                       }
+                       
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        string temp;
@@ -507,7 +548,7 @@ int ClassifySeqsCommand::execute(){
                        m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
                        start = time(NULL);
                        
-                       PhyloSummary taxaSum(taxonomyFileName, groupfile);
+                       PhyloSummary taxaSum(taxonomyFileName, groupfileNames[s]);
                        
                        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0; }
                        
@@ -527,7 +568,7 @@ int ClassifySeqsCommand::execute(){
                                                m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
                                        }else{
                                                for (int i = 0; i < itNames->second; i++) { 
-                                                       taxaSum.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
+                                                       taxaSum.addSeqToTree(name, taxon);  //add it as many times as there are identical seqs
                                                }
                                        }
                                }
@@ -582,7 +623,6 @@ int ClassifySeqsCommand::execute(){
                        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                        for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
                        m->mothurOutEndLine();
-                       
                }
                
                delete classify;
index d19f862d340983786f00c25eaf02a6df30e0dea8..639ae6543082a7162b270da16137078da5c8e362 100644 (file)
@@ -43,6 +43,7 @@ private:
        vector<linePair*> lines;
        vector<string> fastaFileNames;
        vector<string> namefileNames;
+       vector<string> groupfileNames;
        map<string, int> nameMap;
        map<string, int>::iterator itNames;
        
index b68bf61e4703fbd77133f8ba72c8488a78519d29..9e5801b1b057b2f2a1cc4a986e105ae3e73f14b5 100644 (file)
@@ -99,15 +99,6 @@ bool InteractEngine::getInput(){
 /***********************************************************************/
 string Engine::getCommand()  {
        try {
-       #ifdef USE_MPI //mpirun doesn't work with readline
-                               string nextCommand = "";
-                               
-                               mout->mothurOut("mothur > ");
-                               getline(cin, nextCommand);
-                               mout->mothurOutJustToLog(toString(nextCommand));
-                               
-                               return nextCommand;
-       #else
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                        #ifdef USE_READLINE
                                char* nextCommand = NULL;
@@ -138,7 +129,7 @@ string Engine::getCommand()  {
                                
                                return nextCommand;
                #endif
-       #endif
+       
                                                
        }
        catch(exception& e) {
index 28e137376c51261498a4d616ae929ba33b1ff3be..41c321933754ebb0a86dc279985a31711b31f629 100644 (file)
@@ -339,6 +339,8 @@ int FilterSeqsCommand::filterSequences() {
                                        
                                        lines.push_back(new linePair(0, numFastaSeqs));
                                        
+                                       numSeqs += numFastaSeqs;
+                                       
                                        driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
                                }else{
                                        setLines(fastafileNames[s]);
@@ -362,6 +364,8 @@ int FilterSeqsCommand::filterSequences() {
                                        
                                lines.push_back(new linePair(0, numFastaSeqs));
                                
+                               numSeqs += numFastaSeqs;
+                               
                                driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
 
                                if (m->control_pressed) {  return 1; }
index f40c89b3fb3eff837729bc203055d11ef1a3c5a5..7fdf6ed7084463b4e875c8fa01a4e4f9b13db513 100644 (file)
@@ -123,6 +123,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                        
                        int okay = 2;
                        if (outputDir != "") { okay++; }
+                       if (inputDir != "")     { okay++; }
                        
                        if (parameters.size() > okay) { m->mothurOut("You may only enter one of the following: fasta, name, group, alignreport or listfile."); m->mothurOutEndLine(); abort = true;  }
                }
index 07deaa5991c94627fa2e02a88e3b10b2729e290e..9466483c779ce0b32698c8469c6224ea3c63b724 100644 (file)
@@ -452,7 +452,7 @@ vector<seqDist> HCluster::getSeqsAN(){
                        if (distance != -1) { //-1 means skip me
                                seqDist temp(firstName, secondName, distance);
                                sameSeqs.push_back(temp);
-                       }
+                       }else{ distance = 10000; }
                }
                
                if (mergedMinDist < distance) { //get minimum distance from mergedMin
index 915a05ff74a14e7fac2a32da119c184ee9908293..5612d7bbf3eded6f86bcdce40c45048ee8edcb25 100644 (file)
@@ -114,6 +114,8 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
                                        //find out the sequences group
                                        string group = groupmap->getGroup(seqName);
                                        
+                                       if (group == "not found") {  m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine();  }
+                                       
                                        //do you have a count for this group?
                                        map<string, int>::iterator itGroup = tree[currentNode].groupCount.find(group);
                                        
index bb4506385774260141692e0ee5991ac5e10a9ee1..d72ada4cff4fb44cd557eede88d63f44a43c1cd6 100644 (file)
@@ -8,6 +8,8 @@
  */
 
 #include "trimseqscommand.h"
+#include "needlemanoverlap.hpp"
+#include "nast.hpp"
 
 //***************************************************************************************************************
 
@@ -22,7 +24,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                else {
                        //valid paramters for this command
                        string AlignArray[] =  {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", 
-                                                                       "qthreshold", "qaverage", "allfiles", "qtrim", "processors", "outputdir","inputdir"};
+                                                                       "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"};
                        
                        vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                        
@@ -103,6 +105,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
                        temp = validParameter.validFile(parameters, "maxlength", false);        if (temp == "not found") { temp = "0"; }
                        convert(temp, maxLength);
                        
+                       temp = validParameter.validFile(parameters, "diffs", false);            if (temp == "not found") { temp = "0"; }
+                       convert(temp, diffs);
+                       
                        temp = validParameter.validFile(parameters, "qfile", true);     
                        if (temp == "not found")        {       qFileName = "";         }
                        else if(temp == "not open")     {       abort = true;           }
@@ -148,7 +153,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option)  {
 void TrimSeqsCommand::help(){
        try {
                m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
-               m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, qtrim and allfiles.\n");
+               m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
                m->mothurOut("The fasta parameter is required.\n");
                m->mothurOut("The flip parameter .... The default is 0.\n");
                m->mothurOut("The oligos parameter .... The default is "".\n");
@@ -156,6 +161,7 @@ void TrimSeqsCommand::help(){
                m->mothurOut("The maxhomop parameter .... The default is 0.\n");
                m->mothurOut("The minlength parameter .... The default is 0.\n");
                m->mothurOut("The maxlength parameter .... The default is 0.\n");
+               m->mothurOut("The diffs parameter .... The default is 0.\n");
                m->mothurOut("The qfile parameter .....\n");
                m->mothurOut("The qthreshold parameter .... The default is 0.\n");
                m->mothurOut("The qaverage parameter .... The default is 0.\n");
@@ -579,9 +585,7 @@ void TrimSeqsCommand::getOligos(vector<string>& outFASTAVec){ //vector<ofstream*
                m->errorOut(e, "TrimSeqsCommand", "getOligos");
                exit(1);
        }
-
 }
-
 //***************************************************************************************************************
 
 bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
@@ -589,6 +593,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
                string rawSequence = seq.getUnaligned();
                bool success = 0;       //guilty until proven innocent
                
+               //can you find the barcode
                for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
                        string oligo = it->first;
                        if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
@@ -603,6 +608,39 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){
                                break;
                        }
                }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((diffs == 0) || (success == 1)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       //can you find the barcode
+                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                               string oligo = it->first;
+                               if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                                       success = 0;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               Alignment* alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (oligo.length()+diffs+1));
+                               Sequence* templateSeq = new Sequence("temp", rawSequence.substr(0,(oligo.length()+diffs)));
+                               Sequence* candidateSeq = new Sequence("temp2", oligo);
+                               Nast nast(alignment, candidateSeq, templateSeq);
+                               
+                               oligo = candidateSeq->getAligned();
+       cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,(oligo.length())) << endl;                   
+                               delete alignment;
+                               delete templateSeq;
+                               delete candidateSeq;
+                               
+                               if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                                       group = it->second;
+                                       seq.setUnaligned(rawSequence.substr(0,oligo.length()));
+                                       success = 1;
+                                       break;
+                               }
+                       }
+               }
                return success;
                
        }
@@ -635,6 +673,38 @@ bool TrimSeqsCommand::stripForward(Sequence& seq){
                        }
                }
                
+               //if you found the primer or if you don't want to allow for diffs
+               if ((diffs == 0) || (success == 1)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       //can you find the primer
+                       for(int i=0;i<numFPrimers;i++){
+                               string oligo = forPrimer[i];
+                               if(rawSequence.length() < oligo.length()){      
+                                       success = 0;
+                                       break;
+                               }
+                               
+                               //use needleman to align first primer.length()+numdiffs of sequence to each primer
+                               Alignment* alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (oligo.length()+diffs+1));
+                               Sequence* templateSeq = new Sequence("temp", rawSequence.substr(0,(oligo.length()+diffs)));
+                               Sequence* candidateSeq = new Sequence("temp2", oligo);
+                               Nast nast(alignment, candidateSeq, templateSeq);
+                               
+                               oligo = candidateSeq->getAligned();
+                               
+                               delete alignment;
+                               delete templateSeq;
+                               delete candidateSeq;
+                               
+                               if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                                       seq.setUnaligned(rawSequence.substr(0,oligo.length()));
+                                       success = 1;
+                                       break;
+                               }
+                       }
+               }
+               
                return success;
                
        }
@@ -744,7 +814,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
                for(int i=0;i<length;i++){
                        
                        if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
+                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
                                else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
                                else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
                                else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
@@ -757,7 +827,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
                                else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
                                else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
                                
-                               if(success == 0)        {       break;  }
+                               if(success == 0)        {       break;   }
                        }
                        else{
                                success = 1;
index 11778fb9583f5af72b7def6c5ce421d59802dde5..601cb6129d1ba5a32e54dad125e4689d9559d5d6 100644 (file)
@@ -44,7 +44,7 @@ private:
        string fastaFile, oligoFile, qFileName, outputDir;
        
        bool flip, allFiles, qtrim;
-       int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage, processors;
+       int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, qThreshold, qAverage, processors, diffs;
        vector<string> forPrimer, revPrimer, outputNames;
        map<string, int> barcodes;
        vector<string> groupVector;