From: westcott Date: Fri, 7 May 2010 12:39:04 +0000 (+0000) Subject: changed groupfile in classify.seqs to reflect multiple fasta files X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5c80ce8b80938d41cf6c64a017fa6fd50d45de5b changed groupfile in classify.seqs to reflect multiple fasta files --- diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 0538bff..bdf059c 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -82,9 +82,46 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } else if (templateFileName == "not open") { abort = true; } - groupfile = validParameter.validFile(parameters, "group", true); - if (groupfile == "not open") { abort = true; } - else if (groupfile == "not found") { groupfile = ""; } + groupfile = validParameter.validFile(parameters, "group", false); + if (groupfile == "not found") { groupfile = ""; } + else { + splitAtDash(groupfile, groupfileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < groupfileNames.size(); i++) { + if (inputDir != "") { + string path = hasPath(groupfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; } + } + int ableToOpen; + + #ifdef USE_MPI + int pid; + MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + + ifstream in; + ableToOpen = openInputFile(groupfileNames[i], in); + in.close(); + + #ifdef USE_MPI + for (int j = 1; j < processors; j++) { + MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Status status; + MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + + #endif + if (ableToOpen == 1) { m->mothurOut("Unable to match group file with fasta file."); m->mothurOutEndLine(); abort = true; } + + } + } fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; } @@ -193,6 +230,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + if (groupfile != "") { + if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); } + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; @@ -507,7 +548,7 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); - PhyloSummary taxaSum(taxonomyFileName, groupfile); + PhyloSummary taxaSum(taxonomyFileName, groupfileNames[s]); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } @@ -527,7 +568,7 @@ int ClassifySeqsCommand::execute(){ m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); }else{ for (int i = 0; i < itNames->second; i++) { - taxaSum.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs + taxaSum.addSeqToTree(name, taxon); //add it as many times as there are identical seqs } } } @@ -582,7 +623,6 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); - } delete classify; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index d19f862..639ae65 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -43,6 +43,7 @@ private: vector lines; vector fastaFileNames; vector namefileNames; + vector groupfileNames; map nameMap; map::iterator itNames; diff --git a/engine.cpp b/engine.cpp index b68bf61..9e5801b 100644 --- a/engine.cpp +++ b/engine.cpp @@ -99,15 +99,6 @@ bool InteractEngine::getInput(){ /***********************************************************************/ string Engine::getCommand() { try { - #ifdef USE_MPI //mpirun doesn't work with readline - string nextCommand = ""; - - mout->mothurOut("mothur > "); - getline(cin, nextCommand); - mout->mothurOutJustToLog(toString(nextCommand)); - - return nextCommand; - #else #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) #ifdef USE_READLINE char* nextCommand = NULL; @@ -138,7 +129,7 @@ string Engine::getCommand() { return nextCommand; #endif - #endif + } catch(exception& e) { diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 28e1373..41c3219 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -339,6 +339,8 @@ int FilterSeqsCommand::filterSequences() { lines.push_back(new linePair(0, numFastaSeqs)); + numSeqs += numFastaSeqs; + driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); }else{ setLines(fastafileNames[s]); @@ -362,6 +364,8 @@ int FilterSeqsCommand::filterSequences() { lines.push_back(new linePair(0, numFastaSeqs)); + numSeqs += numFastaSeqs; + driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]); if (m->control_pressed) { return 1; } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index f40c89b..7fdf6ed 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -123,6 +123,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { int okay = 2; if (outputDir != "") { okay++; } + if (inputDir != "") { okay++; } if (parameters.size() > okay) { m->mothurOut("You may only enter one of the following: fasta, name, group, alignreport or listfile."); m->mothurOutEndLine(); abort = true; } } diff --git a/hcluster.cpp b/hcluster.cpp index 07deaa5..9466483 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -452,7 +452,7 @@ vector HCluster::getSeqsAN(){ if (distance != -1) { //-1 means skip me seqDist temp(firstName, secondName, distance); sameSeqs.push_back(temp); - } + }else{ distance = 10000; } } if (mergedMinDist < distance) { //get minimum distance from mergedMin diff --git a/phylosummary.cpp b/phylosummary.cpp index 915a05f..5612d7b 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -114,6 +114,8 @@ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ //find out the sequences group string group = groupmap->getGroup(seqName); + if (group == "not found") { m->mothurOut(seqName + " is not in your groupfile, and will be included in the overall total, but not any group total."); m->mothurOutEndLine(); } + //do you have a count for this group? map::iterator itGroup = tree[currentNode].groupCount.find(group); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index bb45063..d72ada4 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -8,6 +8,8 @@ */ #include "trimseqscommand.h" +#include "needlemanoverlap.hpp" +#include "nast.hpp" //*************************************************************************************************************** @@ -22,7 +24,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qaverage", "allfiles", "qtrim", "processors", "outputdir","inputdir"}; + "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -103,6 +105,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); + temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, diffs); + temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } else if(temp == "not open") { abort = true; } @@ -148,7 +153,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { void TrimSeqsCommand::help(){ try { m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); - m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, qtrim and allfiles.\n"); + m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); m->mothurOut("The fasta parameter is required.\n"); m->mothurOut("The flip parameter .... The default is 0.\n"); m->mothurOut("The oligos parameter .... The default is "".\n"); @@ -156,6 +161,7 @@ void TrimSeqsCommand::help(){ m->mothurOut("The maxhomop parameter .... The default is 0.\n"); m->mothurOut("The minlength parameter .... The default is 0.\n"); m->mothurOut("The maxlength parameter .... The default is 0.\n"); + m->mothurOut("The diffs parameter .... The default is 0.\n"); m->mothurOut("The qfile parameter .....\n"); m->mothurOut("The qthreshold parameter .... The default is 0.\n"); m->mothurOut("The qaverage parameter .... The default is 0.\n"); @@ -579,9 +585,7 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vectorerrorOut(e, "TrimSeqsCommand", "getOligos"); exit(1); } - } - //*************************************************************************************************************** bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ @@ -589,6 +593,7 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ string rawSequence = seq.getUnaligned(); bool success = 0; //guilty until proven innocent + //can you find the barcode for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length @@ -603,6 +608,39 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ break; } } + + //if you found the barcode or if you don't want to allow for diffs + if ((diffs == 0) || (success == 1)) { return success; } + + else { //try aligning and see if you can find it + //can you find the barcode + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = 0; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + Alignment* alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (oligo.length()+diffs+1)); + Sequence* templateSeq = new Sequence("temp", rawSequence.substr(0,(oligo.length()+diffs))); + Sequence* candidateSeq = new Sequence("temp2", oligo); + Nast nast(alignment, candidateSeq, templateSeq); + + oligo = candidateSeq->getAligned(); + cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,(oligo.length())) << endl; + delete alignment; + delete templateSeq; + delete candidateSeq; + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,oligo.length())); + success = 1; + break; + } + } + } return success; } @@ -635,6 +673,38 @@ bool TrimSeqsCommand::stripForward(Sequence& seq){ } } + //if you found the primer or if you don't want to allow for diffs + if ((diffs == 0) || (success == 1)) { return success; } + + else { //try aligning and see if you can find it + //can you find the primer + for(int i=0;igetAligned(); + + delete alignment; + delete templateSeq; + delete candidateSeq; + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + seq.setUnaligned(rawSequence.substr(0,oligo.length())); + success = 1; + break; + } + } + } + return success; } @@ -744,7 +814,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ for(int i=0;i forPrimer, revPrimer, outputNames; map barcodes; vector groupVector;