From: westcott Date: Thu, 19 Jan 2012 12:44:14 +0000 (+0000) Subject: fixed bug with shhh.flow from file path name in write functions, added "smart" featur... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=16abd6271c455bd01b34ff89a2e3641bef0fa128 fixed bug with shhh.flow from file path name in write functions, added "smart" feature with namefile, added fasta and qfile parameters to fastq.info, added checking for reverse sequences in classifier. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c7c7cee..99454a9 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -2316,8 +2316,8 @@ GCC_OPTIMIZATION_LEVEL = 3; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../release\\\"\"", - "VERSION=\"\\\"1.22.0\\\"\"", - "RELEASE_DATE=\"\\\"10/12/2011\\\"\"", + "VERSION=\"\\\"1.23.0\\\"\"", + "RELEASE_DATE=\"\\\"1/9/2012\\\"\"", ); "GCC_VERSION[arch=*]" = ""; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; diff --git a/bayesian.cpp b/bayesian.cpp index b1f2c4c..f7ea6e4 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -12,12 +12,13 @@ #include "phylosummary.h" #include "referencedb.h" /**************************************************************************************************/ -Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid) : -Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { +Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f) : +Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { ReferenceDB* rdb = ReferenceDB::getInstance(); threadID = tid; + flip = f; string baseName = tempFile; if (baseName == "saved") { baseName = rdb->getSavedReference(); } @@ -78,13 +79,14 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (tfile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory... "); cout.flush();; wordGenusProb = rdb->wordGenusProb; + WordPairDiffArr = rdb->WordPairDiffArr; }else { m->mothurOut("Reading template probabilities... "); cout.flush(); readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); } //save probabilities - if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } }else{ //create search database and names vector @@ -108,11 +110,12 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //initialze probabilities wordGenusProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //cout << numKmers << '\t' << genusNodes.size() << endl; for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } //cout << numKmers << '\t' << genusNodes.size() << endl; - //ofstream out; - //ofstream out2; + ofstream out; + ofstream out2; #ifdef USE_MPI int pid; @@ -122,17 +125,17 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { #endif - //m->openOutputFile(probFileName, out); + m->openOutputFile(probFileName, out); //output mothur version - //out << "#" << m->getVersion() << endl; + out << "#" << m->getVersion() << endl; - //out << numKmers << endl; + out << numKmers << endl; - //m->openOutputFile(probFileName2, out2); + m->openOutputFile(probFileName2, out2); //output mothur version - //out2 << "#" << m->getVersion() << endl; + out2 << "#" << m->getVersion() << endl; #ifdef USE_MPI } @@ -149,7 +152,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << i << '\t'; + out << i << '\t'; #ifdef USE_MPI } @@ -168,7 +171,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1); float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1); - + diffPair tempProb(log(probabilityInTemplate), 0.0); + WordPairDiffArr[i] = tempProb; + int numNotZero = 0; for (int k = 0; k < genusNodes.size(); k++) { //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); @@ -184,7 +189,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << k << '\t' << wordGenusProb[i][k] << '\t'; + out << k << '\t' << wordGenusProb[i][k] << '\t' ; #ifdef USE_MPI } @@ -200,8 +205,8 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << endl; - //out2 << probabilityInTemplate << '\t' << numNotZero << endl; + out << endl; + out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl; #ifdef USE_MPI } @@ -214,8 +219,8 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out.close(); - //out2.close(); + out.close(); + out2.close(); #ifdef USE_MPI } @@ -228,10 +233,15 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); //save probabilities - if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } } } - + + generateWordPairDiffArr(); + + //save probabilities + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } + m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine(); } @@ -258,13 +268,13 @@ string Bayesian::getTaxonomy(Sequence* seq) { try { string tax = ""; Kmer kmer(kmerSize); + flipped = false; //get words contained in query //getKmerString returns a string where the index in the string is hte kmer number //and the character at that index can be converted to be the number of times that kmer was seen - string queryKmerString = kmer.getKmerString(seq->getUnaligned()); - + vector queryKmers; for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it if (queryKmerString[i] != '!') { //this kmer is in the query @@ -272,7 +282,22 @@ string Bayesian::getTaxonomy(Sequence* seq) { } } - if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; } + //if user wants to test reverse compliment and its reversed use that instead + if (flip) { + if (isReversed(queryKmers)) { + flipped = true; + seq->reverseComplement(); + queryKmerString = kmer.getKmerString(seq->getUnaligned()); + queryKmers.clear(); + for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it + if (queryKmerString[i] != '!') { //this kmer is in the query + queryKmers.push_back(i); + } + } + } + } + + if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); simpleTax = "unknown;"; return "unknown;"; } int index = getMostProbableTaxonomy(queryKmers); @@ -283,7 +308,7 @@ string Bayesian::getTaxonomy(Sequence* seq) { int numToSelect = queryKmers.size() / 8; tax = bootstrapResults(queryKmers, index, numToSelect); - + return tax; } catch(exception& e) { @@ -366,7 +391,8 @@ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { seqTax = phyloTree->get(seqTax.parent); } - if (confidenceTax == "") { confidenceTax = "unclassified;"; simpleTax = "unclassified;"; } + if (confidenceTax == "") { confidenceTax = "unknown;"; simpleTax = "unknown;"; } + return confidenceTax; } @@ -412,6 +438,46 @@ int Bayesian::getMostProbableTaxonomy(vector queryKmer) { exit(1); } } +//******************************************************************************************************************** +//if it is more probable that the reverse compliment kmers are in the template, then we assume the sequence is reversed. +bool Bayesian::isReversed(vector& queryKmers){ + try{ + bool reversed = false; + float prob = 0; + float reverseProb = 0; + + for (int i = 0; i < queryKmers.size(); i++){ + int kmer = queryKmers[i]; + if (kmer >= 0){ + prob += WordPairDiffArr[kmer].prob; + reverseProb += WordPairDiffArr[kmer].reverseProb; + } + } + + if (reverseProb > prob){ reversed = true; } + + return reversed; + } + catch(exception& e) { + m->errorOut(e, "Bayesian", "isReversed"); + exit(1); + } +} +//******************************************************************************************************************** +int Bayesian::generateWordPairDiffArr(){ + try{ + Kmer kmer(kmerSize); + for (int i = 0; i < WordPairDiffArr.size(); i++) { + int reversedWord = kmer.getReverseKmerNumber(i); + WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob; + } + + return 0; + }catch(exception& e) { + m->errorOut(e, "Bayesian", "generateWordPairDiffArr"); + exit(1); + } +} /************************************************************************************************* map Bayesian::parseTaxMap(string newTax) { try{ @@ -515,7 +581,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string int kmer, name; vector numbers; numbers.resize(numKmers); float prob; - vector zeroCountProb; zeroCountProb.resize(numKmers); + vector zeroCountProb; zeroCountProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //read version length = positions[1] - positions[0]; @@ -537,7 +604,10 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string delete buf4; istringstream iss (tempBuf,istringstream::in); - iss >> zeroCountProb[i] >> numbers[i]; + float probTemp; + iss >> zeroCountProb[i] >> numbers[i] >> probTemp; + WordPairDiffArr[i].prob = tempProb; + } MPI_File_close(&inMPI); @@ -585,13 +655,16 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string int kmer, name, count; count = 0; vector num; num.resize(numKmers); float prob; - vector zeroCountProb; zeroCountProb.resize(numKmers); + vector zeroCountProb; zeroCountProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //read version string line2 = m->getline(inNum); m->gobble(inNum); + float probTemp; //cout << threadID << '\t' << line2 << '\t' << this << endl; while (inNum) { - inNum >> zeroCountProb[count] >> num[count]; + inNum >> zeroCountProb[count] >> num[count] >> probTemp; + WordPairDiffArr[count].prob = probTemp; count++; m->gobble(inNum); //cout << threadID << '\t' << count << endl; diff --git a/bayesian.h b/bayesian.h index 1cf5145..7c88433 100644 --- a/bayesian.h +++ b/bayesian.h @@ -18,7 +18,7 @@ class Bayesian : public Classify { public: - Bayesian(string, string, string, int, int, int, int); + Bayesian(string, string, string, int, int, int, int, bool); ~Bayesian(); string getTaxonomy(Sequence*); @@ -30,12 +30,17 @@ private: vector genusTotals; vector genusNodes; //indexes in phyloTree where genus' are located + vector WordPairDiffArr; + int kmerSize, numKmers, confidenceThreshold, iters; string bootstrapResults(vector, int, int); int getMostProbableTaxonomy(vector); void readProbFile(ifstream&, ifstream&, string, string); bool checkReleaseDate(ifstream&, ifstream&, ifstream&, ifstream&); + bool isReversed(vector&); + vector createWordIndexArr(Sequence*); + int generateWordPairDiffArr(); }; diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp index 4977cad..2117daf 100644 --- a/binsequencecommand.cpp +++ b/binsequencecommand.cpp @@ -167,7 +167,7 @@ BinSeqCommand::BinSeqCommand(string option) { } namesfile = validParameter.validFile(parameters, "name", true); - if (namesfile == "not open") { abort = true; } + if (namesfile == "not open") { namesfile = ""; abort = true; } else if (namesfile == "not found") { namesfile = ""; } else { m->setNameFile(namesfile); } @@ -176,6 +176,11 @@ BinSeqCommand::BinSeqCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + if (namesfile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/classify.cpp b/classify.cpp index c92d9fd..3bf0d57 100644 --- a/classify.cpp +++ b/classify.cpp @@ -61,7 +61,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me names.push_back(temp.getName()); database->addSequence(temp); } -// database->generateDB(); + database->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); database->readKmerDB(kmerFileTest); @@ -150,7 +150,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } } -// database->generateDB(); + database->generateDB(); MPI_File_close(&inMPI); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else @@ -200,7 +200,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } fastaFile.close(); -// database->generateDB(); + database->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); @@ -223,9 +223,9 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me database->setNumSeqs(names.size()); //sanity check - //bool okay = phyloTree->ErrorCheck(names); + bool okay = phyloTree->ErrorCheck(names); - //if (!okay) { m->control_pressed = true; } + if (!okay) { m->control_pressed = true; } m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); @@ -238,7 +238,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } } /**************************************************************************************************/ -Classify::Classify() { m = MothurOut::getInstance(); database = NULL; } +Classify::Classify() { m = MothurOut::getInstance(); database = NULL; flipped=false; } /**************************************************************************************************/ int Classify::readTaxonomy(string file) { diff --git a/classify.h b/classify.h index 2e209f5..4e03547 100644 --- a/classify.h +++ b/classify.h @@ -30,6 +30,7 @@ public: virtual ~Classify(){}; virtual string getTaxonomy(Sequence*) = 0; virtual string getSimpleTax() { return simpleTax; } + virtual bool getFlipped() { return flipped; } virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float); virtual void setDistName(string s) {} //for knn, so if distance method is selected with knn you can create the smallest distance file in the right place. @@ -45,6 +46,7 @@ protected: string taxFile, templateFile, simpleTax; vector names; int threadID; + bool flip, flipped; int readTaxonomy(string); vector parseTax(string); diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index e28961b..c889637 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -182,7 +182,7 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { else if (refTaxonomy == "not open") { abort = true; } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -214,6 +214,11 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } + if (namefile == ""){ + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 9ee7b5e..328cd58 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -27,6 +27,7 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); + //CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); @@ -69,6 +70,7 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n"; @@ -89,6 +91,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ setParameters(); vector tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; } @@ -126,6 +129,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { outputTypes["taxonomy"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -440,15 +444,24 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = m->isTrue(temp); + //temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "T"; } + //flip = m->isTrue(temp); + flip = true; + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine(); search = "kmer"; } + + if (namefileNames.size() == 0){ + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } + } } @@ -470,12 +483,12 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); } + if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } else { m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); m->mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); } if (m->control_pressed) { delete classify; return 0; } @@ -494,6 +507,7 @@ int ClassifySeqsCommand::execute(){ if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy"; + string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "flip.accnos"; string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary"; @@ -503,6 +517,7 @@ int ClassifySeqsCommand::execute(){ } outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); + outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); int start = time(NULL); @@ -521,6 +536,7 @@ int ClassifySeqsCommand::execute(){ MPI_File inMPI; MPI_File outMPINewTax; MPI_File outMPITempTax; + MPI_File outMPIAcc; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -530,6 +546,9 @@ int ClassifySeqsCommand::execute(){ char outTempTax[1024]; strcpy(outTempTax, tempTaxonomyFile.c_str()); + + char outAcc[1024]; + strcpy(outAcc, newaccnosFile.c_str()); char inFileName[1024]; strcpy(inFileName, fastaFileNames[s].c_str()); @@ -537,8 +556,9 @@ int ClassifySeqsCommand::execute(){ MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); + MPI_File_open(MPI_COMM_WORLD, outAcc, outMode, MPI_INFO_NULL, &outMPIAcc); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } if (pid == 0) { //you are the root process @@ -557,9 +577,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } for (int i = 1; i < processors; i++) { int done; @@ -577,9 +597,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } int done = 0; MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); @@ -589,6 +609,7 @@ int ClassifySeqsCommand::execute(){ MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); + MPI_File_close(&outMPIAcc); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else @@ -613,16 +634,19 @@ int ClassifySeqsCommand::execute(){ } #endif if(processors == 1){ - numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); }else{ - numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); } #endif + + if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); - + + #ifdef USE_MPI if (pid == 0) { //this part does not need to be paralellized @@ -744,6 +768,12 @@ int ClassifySeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + current = ""; + itTypes = outputTypes.find("accnos"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } + } + delete classify; return 0; @@ -785,7 +815,7 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { /**************************************************************************************************/ -int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { +int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string accnos, string filename) { try { int num = 0; @@ -802,7 +832,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", filename); //pass numSeqs to parent ofstream out; @@ -820,7 +850,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, } //parent does its part - num = driver(lines[0], taxFileName, tempTaxFile, filename); + num = driver(lines[0], taxFileName, tempTaxFile, accnos, filename); //force parent to wait until all the processes are done for (int i=0;istart, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i); + classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flipThreshold); pDataArray.push_back(tempclass); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. @@ -861,7 +891,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, } //parent does its part - num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", filename); + num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", accnos + toString(processors-1) + ".temp", filename); processIDS.push_back((processors-1)); //Wait until all threads have terminated. @@ -879,8 +909,10 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, for(int i=0;imothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp")); m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp")); + m->mothurRemove((m->getFullPathName(accnos) + toString(processIDS[i]) + ".temp")); } return num; @@ -917,13 +949,16 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){ +int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string accnos, string filename){ try { ofstream outTax; m->openOutputFile(taxFName, outTax); ofstream outTaxSimple; m->openOutputFile(tempTFName, outTaxSimple); + + ofstream outAcc; + m->openOutputFile(accnos, outAcc); ifstream inFASTA; m->openInputFile(filename, inFASTA); @@ -936,7 +971,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT int count = 0; while (!done) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { + inFASTA.close(); + outTax.close(); + outTaxSimple.close(); + outAcc.close(); return 0; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); @@ -945,17 +984,20 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT taxonomy = classify->getTaxonomy(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 0; } - - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - }else{ - outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; - } - - outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } + + //output confidence scores or not + if (probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } + + if (classify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + + outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + count++; } delete candidateSeq; @@ -977,6 +1019,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT inFASTA.close(); outTax.close(); outTaxSimple.close(); + outAcc.close(); return count; } @@ -987,10 +1030,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT } //********************************************************************************************************************** #ifdef USE_MPI -int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, MPI_File& accFile, vector& MPIPos){ try { MPI_Status statusNew; MPI_Status statusTemp; + MPI_Status statusAcc; MPI_Status status; int pid; @@ -1018,29 +1062,40 @@ int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; - }else{ - outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - } - - int length = outputString.length(); - char* buf2 = new char[length]; - memcpy(buf2, outputString.c_str(), length); + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } - MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); - delete buf2; - + //output confidence scores or not + if (probs) { + outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; + }else{ outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - length = outputString.length(); - char* buf = new char[length]; - memcpy(buf, outputString.c_str(), length); + } + + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); + delete buf2; + + outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; + length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); - MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); - delete buf; + MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); + delete buf; + + if (classify->getFlipped()) { + outputString = candidateSeq->getName() + "\n"; + length = outputString.length(); + char* buf3 = new char[length]; + memcpy(buf3, outputString.c_str(), length); + + MPI_File_write_shared(accFile, buf3, length, MPI_CHAR, &statusAcc); + delete buf3; } + } delete candidateSeq; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 0e21a20..0bf4a91 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -72,16 +72,16 @@ private: string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs, save; + bool abort, probs, save, flip; - int driver(linePair*, string, string, string); + int driver(linePair*, string, string, string, string); void appendTaxFiles(string, string); - int createProcesses(string, string, string); + int createProcesses(string, string, string, string); string addUnclassifieds(string, int); int MPIReadNamesFile(string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif }; @@ -93,16 +93,17 @@ struct classifyData { string taxFName; string tempTFName; string filename; - string search, taxonomyFileName, templateFileName, method; + string search, taxonomyFileName, templateFileName, method, accnos; unsigned long long start; unsigned long long end; MothurOut* m; float match, misMatch, gapOpen, gapExtend; int count, kmerSize, threadID, cutoff, iters, numWanted; - bool probs; + bool probs, flip; classifyData(){} - classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) { + classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) { + accnos = acc; taxonomyFileName = tx; templateFileName = te; taxFName = a; @@ -124,6 +125,7 @@ struct classifyData { threadID = tid; probs = p; count = 0; + flip = fli; } }; @@ -141,6 +143,9 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ ofstream outTaxSimple; pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple); + ofstream outAcc; + pDataArray->m->openOutputFile(pDataArray->accnos, outAcc); + ifstream inFASTA; pDataArray->m->openInputFile(pDataArray->filename, inFASTA); @@ -157,12 +162,12 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; - if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); } - else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } + else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID, pDataArray->flipThreshold); } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); - myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } if (pDataArray->m->control_pressed) { delete myclassify; return 0; } @@ -180,16 +185,19 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; } - if (taxonomy != "bad seq") { - //output confidence scores or not - if (pDataArray->probs) { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - }else{ - outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; - } - - outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); } + + //output confidence scores or not + if (pDataArray->probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; } + + outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + + if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + count++; } delete candidateSeq; diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index ad1bd81..772113b 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -147,7 +147,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not found") { namefile = ""; } - else if (namefile == "not open") { abort = true; } + else if (namefile == "not open") { namefile = ""; abort = true; } else { readNameFile(); m->setNameFile(namefile); } string temp; @@ -157,6 +157,11 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { temp = validParameter.validFile(parameters, "percent", false); if (temp == "not found"){ temp = "0"; } m->mothurConvert(temp, percent); + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 787d84a..34caf65 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -201,7 +201,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); } taxFile = validParameter.validFile(parameters, "taxonomy", true); - if (taxFile == "not open") { abort = true; } + if (taxFile == "not open") { taxFile = ""; abort = true; } else if (taxFile == "not found") { taxFile = ""; } else { m->setTaxonomyFile(taxFile); } diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index 4f8c53b..223e5db 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -142,7 +142,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { }else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -163,7 +163,11 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } - + + if (namefile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/database.hpp b/database.hpp index 580c3f2..b2817a7 100644 --- a/database.hpp +++ b/database.hpp @@ -58,6 +58,7 @@ public: virtual void readKmerDB(ifstream&){}; virtual void setNumSeqs(int i) { numSeqs = i; } virtual vector getSequencesWithKmer(int){ vector filler; return filler; }; + virtual int getReversed(int) { return 0; } virtual int getMaxKmer(){ return 1; } protected: diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index 3fa622f..3541e00 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -125,9 +125,15 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { } oldNameMapFName = validParameter.validFile(parameters, "name", true); - if (oldNameMapFName == "not open") { abort = true; } + if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; } else if (oldNameMapFName == "not found"){ oldNameMapFName = ""; } else { m->setNameFile(oldNameMapFName); } + + if (oldNameMapFName == "") { + vector files; files.push_back(inFastaName); + parser.getNameFile(files); + } + } } diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp index 0c42b7d..432894b 100644 --- a/getgroupscommand.cpp +++ b/getgroupscommand.cpp @@ -16,12 +16,12 @@ //********************************************************************************************************************** vector GetGroupsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pgroup); - CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -180,12 +180,12 @@ GetGroupsCommand::GetGroupsCommand(string option) { else { m->setAccnosFile(accnosfile); } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -200,7 +200,7 @@ GetGroupsCommand::GetGroupsCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } @@ -253,6 +253,10 @@ GetGroupsCommand::GetGroupsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared or list."); m->mothurOutEndLine(); abort = true; } if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index b4e41e5..c2be580 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -168,12 +168,12 @@ GetLineageCommand::GetLineageCommand(string option) { //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -192,7 +192,7 @@ GetLineageCommand::GetLineageCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = m->getTaxonomyFile(); if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } @@ -217,6 +217,11 @@ GetLineageCommand::GetLineageCommand(string option) { m->splitAtChar(taxons, listOfTaxons, '-'); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index ffe49b4..9bf188d 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -206,12 +206,12 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (accnosfile2 == "not found") { accnosfile2 = ""; } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -230,7 +230,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } @@ -249,6 +249,11 @@ GetSeqsCommand::GetSeqsCommand(string option) { dups = m->isTrue(temp); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/groupmap.h b/groupmap.h index 99891a4..29c1741 100644 --- a/groupmap.h +++ b/groupmap.h @@ -2,7 +2,7 @@ #define GROUPMAP_H /* * groupmap.h - * Dotur + * Mothur * * Created by Sarah Westcott on 12/1/08. * Copyright 2008 Schloss Lab UMASS Amherst. All rights reserved. diff --git a/kmer.cpp b/kmer.cpp index 6ee84d3..50574f4 100644 --- a/kmer.cpp +++ b/kmer.cpp @@ -131,6 +131,27 @@ string Kmer::getKmerBases(int kmerNumber){ } return kmer; } +/**************************************************************************************************/ + +int Kmer::getReverseKmerNumber(int kmerNumber){ + + string kmerString = getKmerBases(kmerNumber); + + //get Reverse + string reverse = ""; + for(int i=kmerString.length()-1;i>=0;i--){ + if(kmerString[i] == 'A') { reverse += 'T'; } + else if(kmerString[i] == 'T'){ reverse += 'A'; } + else if(kmerString[i] == 'G'){ reverse += 'C'; } + else if(kmerString[i] == 'C'){ reverse += 'G'; } + else { reverse += 'N'; } + } + + int reverseNumber = getKmerNumber(reverse, 0); + + return reverseNumber; + +} /**************************************************************************************************/ diff --git a/kmer.hpp b/kmer.hpp index d5b67d6..9a3c5a0 100644 --- a/kmer.hpp +++ b/kmer.hpp @@ -21,6 +21,7 @@ public: string getKmerString(string); int getKmerNumber(string, int); string getKmerBases(int); + int getReverseKmerNumber(int); vector< map > getKmerCounts(string sequence); //for use in chimeraCheck private: diff --git a/kmerdb.cpp b/kmerdb.cpp index 2703e16..9a5d235 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -57,6 +57,8 @@ KmerDB::~KmerDB(){} vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ try { + if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; } + vector topMatches; Kmer kmer(kmerSize); searchScore = 0; @@ -215,6 +217,20 @@ int KmerDB::getCount(int kmer) { } } /**************************************************************************************************/ +int KmerDB::getReversed(int kmerNumber) { + try { + Kmer kmer(kmerSize); + + if (kmerNumber < 0) { return 0; } //if user gives negative number + else if (kmerNumber > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer + else { return kmer.getReverseKmerNumber(kmerNumber); } // kmer is in vector range + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "getReversed"); + exit(1); + } +} +/**************************************************************************************************/ vector KmerDB::getSequencesWithKmer(int kmer) { try { diff --git a/kmerdb.hpp b/kmerdb.hpp index 62d4836..4ae00b9 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -36,6 +36,7 @@ public: void readKmerDB(ifstream&); int getCount(int); //returns number of sequences with that kmer number vector getSequencesWithKmer(int); //returns vector of sequences that contain kmer passed in + int getReversed(int); //returns reverse compliment kmerNumber int getMaxKmer() { return maxKmer; } private: diff --git a/knn.cpp b/knn.cpp index 6053b6e..837fa6d 100644 --- a/knn.cpp +++ b/knn.cpp @@ -72,11 +72,11 @@ string Knn::getTaxonomy(Sequence* seq) { } if (closestNames.size() == 0) { - m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); m->mothurOutEndLine(); - tax = "bad seq"; + m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine(); + tax = "unknown;"; }else{ tax = findCommonTaxonomy(closestNames); - if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); m->mothurOutEndLine(); tax = "bad seq"; } + if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; } } simpleTax = tax; @@ -90,7 +90,7 @@ string Knn::getTaxonomy(Sequence* seq) { /**************************************************************************************************/ string Knn::findCommonTaxonomy(vector closest) { try { - vector< vector > taxons; //taxon[0] = vector of taxonomy info for closest[0]. + /*vector< vector > taxons; //taxon[0] = vector of taxonomy info for closest[0]. //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea; //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria.... @@ -101,6 +101,7 @@ string Knn::findCommonTaxonomy(vector closest) { if (m->control_pressed) { return "control"; } string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy + cout << tax << endl; taxons[i] = parseTax(tax); @@ -128,9 +129,54 @@ string Knn::findCommonTaxonomy(vector closest) { } break; } + }*/ + + string conTax; + + //create a tree containing sequences from this bin + PhyloTree* p = new PhyloTree(); + + for (int i = 0; i < closest.size(); i++) { + p->addSeqToTree(closest[i], taxonomy[closest[i]]); } - - return common; + + //build tree + p->assignHeirarchyIDs(0); + + TaxNode currentNode = p->get(0); + + //at each level + while (currentNode.children.size() != 0) { //you still have more to explore + + TaxNode bestChild; + int bestChildSize = 0; + + //go through children + for (map::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) { + + TaxNode temp = p->get(itChild->second); + + //select child with largest accessions - most seqs assigned to it + if (temp.accessions.size() > bestChildSize) { + bestChild = p->get(itChild->second); + bestChildSize = temp.accessions.size(); + } + + } + + if (bestChildSize == closest.size()) { //if yes, add it + conTax += bestChild.name + ";"; + }else{ //if no, quit + break; + } + + //move down a level + currentNode = bestChild; + } + + delete p; + + return conTax; } catch(exception& e) { m->errorOut(e, "Knn", "findCommonTaxonomy"); diff --git a/makefile b/makefile index e2e66c1..476070e 100644 --- a/makefile +++ b/makefile @@ -32,6 +32,7 @@ ifeq ($(strip $(64BIT_VERSION)),yes) #if you using cygwin to build Windows the following line #CXX = x86_64-w64-mingw32-g++ #CC = x86_64-w64-mingw32-g++ + #FORTAN_COMPILER = x86_64-w64-mingw32-gfortran #TARGET_ARCH += -m64 -static #if you are a linux user use the following line diff --git a/memchi2.cpp b/memchi2.cpp index 595b54f..f08b4b8 100644 --- a/memchi2.cpp +++ b/memchi2.cpp @@ -17,7 +17,7 @@ EstOutput MemChi2::getValues(vector shared) { int nonZeroA = 0; int nonZeroB = 0; int totalOtus = shared[0]->getNumBins(); - int totalGroups = shared.size(); + //int totalGroups = shared.size(); //for each otu for (int i = 0; i < shared[0]->getNumBins(); i++) { diff --git a/mergegroupscommand.cpp b/mergegroupscommand.cpp index af2af96..e30558c 100644 --- a/mergegroupscommand.cpp +++ b/mergegroupscommand.cpp @@ -13,7 +13,8 @@ //********************************************************************************************************************** vector MergeGroupsCommand::setParameters(){ try { - CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pgroup); CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); @@ -33,12 +34,12 @@ vector MergeGroupsCommand::setParameters(){ string MergeGroupsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The merge.groups command reads a shared file and a design file and merges the groups in the shared file that are in the same grouping in the design file.\n"; + helpString += "The merge.groups command reads a shared or group file and a design file and merges the groups that are in the same grouping in the design file.\n"; helpString += "The merge.groups command outputs a .shared file. \n"; - helpString += "The merge.groups command parameters are shared, groups, label and design. The design and shared parameter are required.\n"; + helpString += "The merge.groups command parameters are shared, group, groups, label and design. The design parameter is required.\n"; helpString += "The design parameter allows you to assign your groups to sets. It is required. \n"; helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the group name and the second column is the set the group belongs to.\n"; - helpString += "The groups parameter allows you to specify which of the groups in your shared you would like included. The group names are separated by dashes.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your shared or group file you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; helpString += "The merge.groups command should be in the following format: merge.groups(design=yourDesignFile, shared=yourSharedFile).\n"; helpString += "Example merge.groups(design=temp.design, groups=A-B-C, shared=temp.shared).\n"; @@ -58,6 +59,7 @@ MergeGroupsCommand::MergeGroupsCommand(){ setParameters(); vector tempOutNames; outputTypes["shared"] = tempOutNames; + outputTypes["group"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "MergeGroupsCommand", "MetaStatsCommand"); @@ -92,6 +94,7 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["shared"] = tempOutNames; + outputTypes["group"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -116,6 +119,15 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + } //check for required parameters @@ -128,15 +140,15 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; } }else { m->setDesignFile(designfile); } - //make sure the user has already run the read.otu command sharedfile = validParameter.validFile(parameters, "shared", true); - if (sharedfile == "not open") { abort = true; } - else if (sharedfile == "not found") { - //if there is a current shared file, use it - sharedfile = m->getSharedFile(); - if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; } - }else { m->setSharedFile(sharedfile); } + if (sharedfile == "not open") { abort = true; sharedfile = ""; } + else if (sharedfile == "not found") { sharedfile = ""; } + else { m->setSharedFile(sharedfile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; groupfile = ""; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -151,6 +163,19 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { if (groups == "not found") { groups = "all"; } m->splitAtDash(groups, Groups); m->setGroups(Groups); + + if ((sharedfile == "") && (groupfile == "")) { + //give priority to group, then shared + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } + } + } } } @@ -165,17 +190,106 @@ int MergeGroupsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } + + designMap = new GroupMap(designfile); + designMap->readDesignMap(); + + if (groupfile != "") { processGroupFile(designMap); } + if (sharedfile != "") { processSharedFile(designMap); } + + //reset groups parameter + m->clearGroups(); + delete designMap; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;} + + + //set shared file as new current sharedfile + string current = ""; + itTypes = outputTypes.find("shared"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } + } + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MergeGroupsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int MergeGroupsCommand::process(vector& thisLookUp, ofstream& out){ + try { + + map merged; + map::iterator it; + + for (int i = 0; i < thisLookUp.size(); i++) { + + if (m->control_pressed) { return 0; } + + //what grouping does this group belong to + string grouping = designMap->getGroup(thisLookUp[i]->getGroup()); + if (grouping == "not found") { m->mothurOut("[ERROR]: " + thisLookUp[i]->getGroup() + " is not in your design file. Ignoring!"); m->mothurOutEndLine(); grouping = "NOTFOUND"; } + + else { + //do we already have a member of this grouping? + it = merged.find(grouping); + + if (it == merged.end()) { //nope, so create it + merged[grouping] = *thisLookUp[i]; + merged[grouping].setGroup(grouping); + }else { //yes, merge it + + for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { + int abund = (it->second).getAbundance(j); + abund += thisLookUp[i]->getAbundance(j); + + (it->second).set(j, abund, grouping); + } + } + } + } + + //print new file + for (it = merged.begin(); it != merged.end(); it++) { + out << (it->second).getLabel() << '\t' << it->first << '\t'; + (it->second).print(out); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "MergeGroupsCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** + +int MergeGroupsCommand::processSharedFile(GroupMap*& designMap){ + try { - if (outputDir == "") { outputDir += m->hasPath(sharedfile); } - string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "merge" + m->getExtension(sharedfile); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "merge" + m->getExtension(sharedfile); outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); ofstream out; m->openOutputFile(outputFileName, out); - designMap = new GroupMap(designfile); - designMap->readDesignMap(); - InputData input(sharedfile, "sharedfile"); lookup = input.getSharedRAbundVectors(); string lastLabel = lookup[0]->getLabel(); @@ -256,78 +370,68 @@ int MergeGroupsCommand::execute(){ } out.close(); - //reset groups parameter - m->clearGroups(); - delete designMap; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;} - - - //set shared file as new current sharedfile - string current = ""; - itTypes = outputTypes.find("shared"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } - } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); + return 0; + } catch(exception& e) { - m->errorOut(e, "MergeGroupsCommand", "execute"); + m->errorOut(e, "MergeGroupsCommand", "processSharedFile"); exit(1); } } //********************************************************************************************************************** -int MergeGroupsCommand::process(vector& thisLookUp, ofstream& out){ +int MergeGroupsCommand::processGroupFile(GroupMap*& designMap){ try { - map merged; - map::iterator it; + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "merge" + m->getExtension(groupfile); + outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName); - for (int i = 0; i < thisLookUp.size(); i++) { + ofstream out; + m->openOutputFile(outputFileName, out); + + //read groupfile + GroupMap groupMap(groupfile); + groupMap.readMap(); + + //fill Groups - checks for "all" and for any typo groups + SharedUtil* util = new SharedUtil(); + vector nameGroups = groupMap.getNamesOfGroups(); + util->setGroups(Groups, nameGroups); + delete util; + + vector namesOfSeqs = groupMap.getNamesSeqs(); + bool error = false; + + for (int i = 0; i < namesOfSeqs.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { break; } - //what grouping does this group belong to - string grouping = designMap->getGroup(thisLookUp[i]->getGroup()); - if (grouping == "not found") { m->mothurOut("[ERROR]: " + thisLookUp[i]->getGroup() + " is not in your design file. Ignoring!"); m->mothurOutEndLine(); grouping = "NOTFOUND"; } + string thisGroup = groupMap.getGroup(namesOfSeqs[i]); - else { - //do we already have a member of this grouping? - it = merged.find(grouping); + //are you in a group the user wants + if (m->inUsersGroups(thisGroup, Groups)) { + string thisGrouping = designMap->getGroup(thisGroup); - if (it == merged.end()) { //nope, so create it - merged[grouping] = *thisLookUp[i]; - merged[grouping].setGroup(grouping); - }else { //yes, merge it - - for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { - int abund = (it->second).getAbundance(j); - abund += thisLookUp[i]->getAbundance(j); - - (it->second).set(j, abund, grouping); - } + if (thisGrouping == "not found") { m->mothurOut("[ERROR]: " + namesOfSeqs[i] + " is from group " + thisGroup + " which is not in your design file, please correct."); m->mothurOutEndLine(); error = true; } + else { + out << namesOfSeqs[i] << '\t' << thisGrouping << endl; } } } - //print new file - for (it = merged.begin(); it != merged.end(); it++) { - out << (it->second).getLabel() << '\t' << it->first << '\t'; - (it->second).print(out); - } + if (error) { m->control_pressed = true; } + + out.close(); return 0; } catch(exception& e) { - m->errorOut(e, "MergeGroupsCommand", "process"); + m->errorOut(e, "MergeGroupsCommand", "processGroupFile"); exit(1); } } diff --git a/mergegroupscommand.h b/mergegroupscommand.h index 7907dfe..7f216eb 100644 --- a/mergegroupscommand.h +++ b/mergegroupscommand.h @@ -38,10 +38,12 @@ private: bool abort, allLines, pickedGroups; set labels; //holds labels to be used - string groups, label, outputDir, inputDir, designfile, sharedfile; + string groups, label, outputDir, inputDir, designfile, sharedfile, groupfile; vector Groups, outputNames; int process(vector&, ofstream&); + int processSharedFile(GroupMap*&); + int processGroupFile(GroupMap*&); }; #endif diff --git a/mothur.h b/mothur.h index 3b7c459..50344e2 100644 --- a/mothur.h +++ b/mothur.h @@ -106,6 +106,19 @@ struct ThreadNode { IntNode* right; }; +struct diffPair { + float prob; + float reverseProb; + + diffPair() { + prob = 0; reverseProb = 0; + } + diffPair(float p, float rp) { + prob = p; + reverseProb = rp; + } +}; + /************************************************************/ struct clusterNode { int numSeq; diff --git a/mothurout.cpp b/mothurout.cpp index 9dcdc80..20a7b52 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2152,7 +2152,7 @@ int MothurOut::removeConfidences(string& tax) { int pos2 = taxon.find_last_of(')'); if (pos2 != -1) { string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1))); - if (isContainingOnlyDigits(confidenceScore)) { + if (isNumeric1(confidenceScore)) { taxon = taxon.substr(0, pos); //rip off confidence } } diff --git a/optionparser.cpp b/optionparser.cpp index 71714f1..1e8c445 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -104,7 +104,7 @@ map OptionParser::getParameters() { } }else{ it++; } } - + return parameters; } catch(exception& e) { @@ -113,4 +113,54 @@ map OptionParser::getParameters() { } } +/***********************************************************************/ +//pass a vector of filenames that may match the current namefile. +//this function will look at each one, if the rootnames match, mothur will warn +//the user that they may have neglected to provide a namefile. +//stops when it finds a match. +bool OptionParser::getNameFile(vector files) { + try { + string namefile = m->getNameFile(); + bool match = false; + + if (namefile != "") { + string temp = m->getRootName(m->getSimpleName(namefile)); + vector rootName; + m->splitAtChar(temp, rootName, '.'); + + for (int i = 0; i < files.size(); i++) { + temp = m->getRootName(m->getSimpleName(files[i])); + vector root; + m->splitAtChar(temp, root, '.'); + + int smallest = rootName.size(); + if (root.size() < smallest) { smallest = root.size(); } + + int numMatches = 0; + for(int j = 0; j < smallest; j++) { + if (root[j] == rootName[j]) { numMatches++; } + } + + if (smallest > 0) { + if ((numMatches >= (smallest-2)) && (root[0] == rootName[0])) { + m->mothurOut("[WARNING]: This command can take a namefile and you did not provide one. The current namefile is " + namefile + " which seems to match " + files[i] + "."); + m->mothurOutEndLine(); + match = true; + break; + } + } + } + + } + + + return match; + } + catch(exception& e) { + m->errorOut(e, "OptionParser", "getNameFile"); + exit(1); + } +} + + /***********************************************************************/ diff --git a/optionparser.h b/optionparser.h index 7707e73..facd1f8 100644 --- a/optionparser.h +++ b/optionparser.h @@ -14,7 +14,7 @@ #include "mothur.h" #include "mothurout.h" - +#include "command.hpp" /***********************************************************************/ @@ -23,6 +23,7 @@ public: OptionParser(string); ~OptionParser() {} map getParameters(); + bool getNameFile(vector); private: map parameters; MothurOut* m; diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 9cba70b..9509a20 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -14,6 +14,8 @@ vector ParseFastaQCommand::setParameters(){ try { CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfastq); + CommandParameter pfasta("fasta", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pfasta); + CommandParameter pqual("qfile", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pqual); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -104,6 +106,15 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastaQFile); } + + string temp; + temp = validParameter.validFile(parameters, "fasta", false); if(temp == "not found"){ temp = "T"; } + fasta = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "qfile", false); if(temp == "not found"){ temp = "T"; } + qual = m->isTrue(temp); + + if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. Aborting."); m->mothurOutEndLine(); abort=true; } } } @@ -122,13 +133,16 @@ int ParseFastaQCommand::execute(){ string fastaFile = outputDir + m->getRootName(m->getSimpleName(fastaQFile)) + "fasta"; string qualFile = outputDir + m->getRootName(m->getSimpleName(fastaQFile)) + "qual"; ofstream outFasta, outQual; - m->openOutputFile(fastaFile, outFasta); outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile); - m->openOutputFile(qualFile, outQual); outputNames.push_back(qualFile); outputTypes["qfile"].push_back(qualFile); + + if (fasta) { m->openOutputFile(fastaFile, outFasta); outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile); } + if (qual) { m->openOutputFile(qualFile, outQual); outputNames.push_back(qualFile); outputTypes["qfile"].push_back(qualFile); } ifstream in; m->openInputFile(fastaQFile, in); while (!in.eof()) { + + if (m->control_pressed) { break; } //read sequence name string name = m->getline(in); m->gobble(in); @@ -147,27 +161,27 @@ int ParseFastaQCommand::execute(){ else { name2 = name2.substr(1); } //read quality scores - string qual = m->getline(in); m->gobble(in); - if (qual == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } + string quality = m->getline(in); m->gobble(in); + if (quality == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } //sanity check sequence length and number of quality scores match if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } } - if (qual.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(qual.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } - - //convert quality scores - vector qualScores = convertQual(qual); + if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } //print sequence info to files - outFasta << ">" << name << endl << sequence << endl; + if (fasta) { outFasta << ">" << name << endl << sequence << endl; } - outQual << ">" << name << endl; - for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } - outQual << endl; + if (qual) { + vector qualScores = convertQual(quality); + outQual << ">" << name << endl; + for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } + outQual << endl; + } } in.close(); - outFasta.close(); - outQual.close(); + if (fasta) { outFasta.close(); } + if (qual) { outQual.close(); } if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(fastaFile); m->mothurRemove(qualFile); return 0; } diff --git a/parsefastaqcommand.h b/parsefastaqcommand.h index 6052456..6feabce 100644 --- a/parsefastaqcommand.h +++ b/parsefastaqcommand.h @@ -34,7 +34,7 @@ private: vector outputNames; string outputDir, fastaQFile; - bool abort; + bool abort, fasta, qual; vector convertQual(string); }; diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp index ccf2952..2d46efc 100644 --- a/parsimonycommand.cpp +++ b/parsimonycommand.cpp @@ -139,7 +139,7 @@ ParsimonyCommand::ParsimonyCommand(string option) { if (randomtree == "") { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); } @@ -153,7 +153,7 @@ ParsimonyCommand::ParsimonyCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } } @@ -177,6 +177,11 @@ ParsimonyCommand::ParsimonyCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index 69c7b30..2b15d11 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -144,7 +144,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); @@ -159,7 +159,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -197,6 +197,11 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { } if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } } diff --git a/phylosummary.cpp b/phylosummary.cpp index 22e8e75..7d9fcac 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -121,6 +121,7 @@ string PhyloSummary::getNextTaxon(string& heirarchy){ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ try { + numSeqs++; map::iterator childPointer; diff --git a/phylotree.cpp b/phylotree.cpp index e430fb9..dba1e3b 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -20,6 +20,7 @@ PhyloTree::PhyloTree(){ tree[0].heirarchyID = "0"; maxLevel = 0; calcTotals = true; + addSeqToTree("unknown", "unknown;"); } catch(exception& e) { m->errorOut(e, "PhyloTree", "PhyloTree"); @@ -127,6 +128,7 @@ PhyloTree::PhyloTree(string tfile){ maxLevel = 0; calcTotals = true; string name, tax; + addSeqToTree("unknown", "unknown;"); #ifdef USE_MPI @@ -232,7 +234,6 @@ string PhyloTree::getNextTaxon(string& heirarchy, string seqname){ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ try { - numSeqs++; map::iterator childPointer; @@ -375,7 +376,7 @@ void PhyloTree::binUnclassified(string file){ map::iterator childPointer; vector copy = tree; - + //fill out tree fillOutTree(0, copy); @@ -484,16 +485,16 @@ string PhyloTree::getFullTaxonomy(string seqName) { void PhyloTree::print(ofstream& out, vector& copy){ try { - + //output mothur version out << "#" << m->getVersion() << endl; out << copy.size() << endl; out << maxLevel << endl; - + for (int i = 0; i < copy.size(); i++) { - + out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; map::iterator it; @@ -606,6 +607,7 @@ bool PhyloTree::ErrorCheck(vector templateFileNames){ try { bool okay = true; + templateFileNames.push_back("unknown"); map::iterator itFind; map taxonomyFileNames = name2Taxonomy; diff --git a/phylotypecommand.cpp b/phylotypecommand.cpp index 00f960b..1d6c391 100644 --- a/phylotypecommand.cpp +++ b/phylotypecommand.cpp @@ -128,11 +128,11 @@ PhylotypeCommand::PhylotypeCommand(string option) { m->mothurOut("No valid current files. taxonomy is a required parameter."); m->mothurOutEndLine(); abort = true; } - }else if (taxonomyFileName == "not open") { abort = true; } + }else if (taxonomyFileName == "not open") { taxonomyFileName = ""; abort = true; } else { m->setTaxonomyFile(taxonomyFileName); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { readNamesFile(); m->setNameFile(namefile); } @@ -153,6 +153,11 @@ PhylotypeCommand::PhylotypeCommand(string option) { else { allLines = 1; } } + if (namefile == "") { + vector files; files.push_back(taxonomyFileName); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 21de034..582da49 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -147,7 +147,7 @@ PreClusterCommand::PreClusterCommand(string option) { // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not found") { namefile = ""; } - else if (namefile == "not open") { abort = true; } + else if (namefile == "not open") { namefile = ""; abort = true; } else { m->setNameFile(namefile); } groupfile = validParameter.validFile(parameters, "group", true); @@ -162,7 +162,10 @@ PreClusterCommand::PreClusterCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/referencedb.cpp b/referencedb.cpp index fb930ef..36400c1 100644 --- a/referencedb.cpp +++ b/referencedb.cpp @@ -22,6 +22,7 @@ void ReferenceDB::clearMemory() { setSavedReference(""); for(int i = 0; i < wordGenusProb.size(); i++) { wordGenusProb[i].clear(); } wordGenusProb.clear(); + WordPairDiffArr.clear(); setSavedTaxonomy(""); } /******************************************************* diff --git a/referencedb.h b/referencedb.h index 2f29247..5262e80 100644 --- a/referencedb.h +++ b/referencedb.h @@ -26,6 +26,7 @@ class ReferenceDB { bool save; vector referenceSeqs; vector< vector > wordGenusProb; + vector WordPairDiffArr; string getSavedReference() { return referencefile; } void setSavedReference(string p) { referencefile = p; } diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp index e5a3bb2..f35da2e 100644 --- a/removegroupscommand.cpp +++ b/removegroupscommand.cpp @@ -16,12 +16,12 @@ //********************************************************************************************************************** vector RemoveGroupsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pgroup); - CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -253,6 +253,12 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared or list."); m->mothurOutEndLine(); abort = true; } if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index f55158f..9fec249 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -169,12 +169,12 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -193,7 +193,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = m->getTaxonomyFile(); if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } @@ -220,7 +220,12 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } - + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 59c9b34..2abe9ef 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -194,12 +194,12 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { }else { m->setAccnosFile(accnosfile); } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -237,6 +237,10 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 16618f7..4106739 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -260,6 +260,11 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index fe80b78..24142e0 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -122,7 +122,7 @@ AlignCheckCommand::AlignCheckCommand(string option) { else if (mapfile == "not found") { mapfile = ""; m->mothurOut("You must provide an map file."); m->mothurOutEndLine(); abort = true; } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = m->getFastaFile(); if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } @@ -139,7 +139,11 @@ AlignCheckCommand::AlignCheckCommand(string option) { outputDir = ""; outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it } - + + if ((namefile == "") && (fastafile != "")){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 9ac7c5b..d8ebe50 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -176,7 +176,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } } - else if (queryFileName == "not open") { abort = true; } + else if (queryFileName == "not open") { queryFileName = ""; abort = true; } else { m->setFastaFile(queryFileName); } referenceFileName = validParameter.validFile(parameters, "reference", true); @@ -246,6 +246,11 @@ SeqErrorCommand::SeqErrorCommand(string option) { substitutionMatrix.resize(6); for(int i=0;i<6;i++){ substitutionMatrix[i].resize(6,0); } + + if ((namesFileName == "") && (queryFileName != "")){ + vector files; files.push_back(queryFileName); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 93950d1..e8f73ca 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -132,8 +132,12 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); - - + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/sequence.cpp b/sequence.cpp index 162d3be..090ee14 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -604,7 +604,6 @@ void Sequence::padFromPos(int end){ bool Sequence::getIsAligned(){ return isAligned; } - //******************************************************************************************************************** void Sequence::reverseComplement(){ diff --git a/setdircommand.cpp b/setdircommand.cpp index 17bad60..67a1f59 100644 --- a/setdircommand.cpp +++ b/setdircommand.cpp @@ -202,8 +202,19 @@ int SetDirectoryCommand::execute(){ if (lastChar != "\\") { tempdefault += "\\"; } #endif - m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine(); - m->setDefaultPath(tempdefault); + //test to make sure directory exists + tempdefault = m->getFullPathName(tempdefault); + string inTemp = tempdefault + tag + "temp"; + ofstream in; + in.open(inTemp.c_str(), ios::trunc); + if(!in) { + m->mothurOut(tempdefault + " directory does not exist or is not writable."); m->mothurOutEndLine(); + }else{ + in.close(); + m->mothurRemove(inTemp); + m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine(); + m->setDefaultPath(tempdefault); + } } return 0; diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 40a4c84..5378211 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -23,7 +23,7 @@ vector ShhherCommand::setParameters(){ try { CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow); CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile); - CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plookup); + CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup); CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter); @@ -2152,7 +2152,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string qualityFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual"; + string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.qual"; ofstream qualityFile; m->openOutputFile(qualityFileName, qualityFile); @@ -2259,7 +2259,7 @@ void ShhherCommand::writeSequences(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fastaFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta"; + string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.fasta"; ofstream fastaFile; m->openOutputFile(fastaFileName, fastaFile); @@ -2307,7 +2307,7 @@ void ShhherCommand::writeNames(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string nameFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; + string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.names"; ofstream nameFile; m->openOutputFile(nameFileName, nameFile); @@ -2345,7 +2345,7 @@ void ShhherCommand::writeGroups(){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fileRoot = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')); + string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)); string groupFileName = fileRoot + ".shhh.groups"; ofstream groupFile; m->openOutputFile(groupFileName, groupFile); @@ -2370,7 +2370,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string otuCountsFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts"; + string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.counts"; ofstream otuCountsFile; m->openOutputFile(otuCountsFileName, otuCountsFile); diff --git a/shhhseqscommand.cpp b/shhhseqscommand.cpp index 6a339b8..518e78d 100644 --- a/shhhseqscommand.cpp +++ b/shhhseqscommand.cpp @@ -165,6 +165,11 @@ ShhhSeqsCommand::ShhhSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/splitabundcommand.cpp b/splitabundcommand.cpp index 989ef59..8131fa9 100644 --- a/splitabundcommand.cpp +++ b/splitabundcommand.cpp @@ -216,7 +216,6 @@ SplitAbundCommand::SplitAbundCommand(string option) { m->mothurConvert(temp, cutoff); if (cutoff == 0) { m->mothurOut("You must provide a cutoff to qualify what is abundant for the split.abund command. "); m->mothurOutEndLine(); abort = true; } - } } diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp index 113439c..2671655 100644 --- a/splitgroupscommand.cpp +++ b/splitgroupscommand.cpp @@ -125,7 +125,7 @@ SplitGroupCommand::SplitGroupCommand(string option) { namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -151,6 +151,11 @@ SplitGroupCommand::SplitGroupCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(groupfile); } + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index fe9e934..c352feb 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -255,6 +255,10 @@ SubSampleCommand::SubSampleCommand(string option) { if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/summaryqualcommand.cpp b/summaryqualcommand.cpp index 421a8f2..2a40e08 100644 --- a/summaryqualcommand.cpp +++ b/summaryqualcommand.cpp @@ -111,7 +111,7 @@ SummaryQualCommand::SummaryQualCommand(string option) { //check for required parameters qualfile = validParameter.validFile(parameters, "qfile", true); - if (qualfile == "not open") { abort = true; } + if (qualfile == "not open") { qualfile = ""; abort = true; } else if (qualfile == "not found") { qualfile = m->getQualFile(); if (qualfile != "") { m->mothurOut("Using " + qualfile + " as input file for the qfile parameter."); m->mothurOutEndLine(); } @@ -131,7 +131,12 @@ SummaryQualCommand::SummaryQualCommand(string option) { string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - m->mothurConvert(temp, processors); + m->mothurConvert(temp, processors); + + if (namefile == "") { + vector files; files.push_back(qualfile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp index a19986a..ffa3041 100644 --- a/summarytaxcommand.cpp +++ b/summarytaxcommand.cpp @@ -157,6 +157,12 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { outputDir = ""; outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it } + + if (namefile == "") { + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 213e241..dd3427b 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -292,6 +292,11 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } + + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } } } diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 047d06e..a404f79 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -155,7 +155,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -198,6 +198,11 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { m->splitAtDash(groups, Groups); m->setGroups(Groups); } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index b007db1..f9dc450 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -141,7 +141,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); } @@ -155,7 +155,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -192,6 +192,11 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { m->mothurConvert(temp, processors); if (!random) { iters = 0; } //turn off random calcs + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } }