From 16abd6271c455bd01b34ff89a2e3641bef0fa128 Mon Sep 17 00:00:00 2001 From: westcott Date: Thu, 19 Jan 2012 12:44:14 +0000 Subject: [PATCH] fixed bug with shhh.flow from file path name in write functions, added "smart" feature with namefile, added fasta and qfile parameters to fastq.info, added checking for reverse sequences in classifier. --- Mothur.xcodeproj/project.pbxproj | 4 +- bayesian.cpp | 129 +++++++++++++---- bayesian.h | 7 +- binsequencecommand.cpp | 7 +- classify.cpp | 12 +- classify.h | 2 + classifyotucommand.cpp | 7 +- classifyseqscommand.cpp | 151 +++++++++++++------- classifyseqscommand.h | 46 +++--- clusterfragmentscommand.cpp | 7 +- clustersplitcommand.cpp | 2 +- consensusseqscommand.cpp | 8 +- database.hpp | 1 + deconvolutecommand.cpp | 8 +- getgroupscommand.cpp | 22 +-- getlineagecommand.cpp | 11 +- getseqscommand.cpp | 11 +- groupmap.h | 2 +- kmer.cpp | 21 +++ kmer.hpp | 1 + kmerdb.cpp | 16 +++ kmerdb.hpp | 1 + knn.cpp | 58 +++++++- makefile | 1 + memchi2.cpp | 2 +- mergegroupscommand.cpp | 232 ++++++++++++++++++++++--------- mergegroupscommand.h | 4 +- mothur.h | 13 ++ mothurout.cpp | 2 +- optionparser.cpp | 52 ++++++- optionparser.h | 3 +- parsefastaqcommand.cpp | 42 ++++-- parsefastaqcommand.h | 2 +- parsimonycommand.cpp | 9 +- phylodiversitycommand.cpp | 9 +- phylosummary.cpp | 1 + phylotree.cpp | 12 +- phylotypecommand.cpp | 9 +- preclustercommand.cpp | 7 +- referencedb.cpp | 1 + referencedb.h | 1 + removegroupscommand.cpp | 18 ++- removelineagecommand.cpp | 13 +- removeseqscommand.cpp | 8 +- screenseqscommand.cpp | 5 + secondarystructurecommand.cpp | 8 +- seqerrorcommand.cpp | 7 +- seqsummarycommand.cpp | 8 +- sequence.cpp | 1 - setdircommand.cpp | 15 +- shhhercommand.cpp | 12 +- shhhseqscommand.cpp | 5 + splitabundcommand.cpp | 1 - splitgroupscommand.cpp | 7 +- subsamplecommand.cpp | 4 + summaryqualcommand.cpp | 9 +- summarytaxcommand.cpp | 6 + trimseqscommand.cpp | 5 + unifracunweightedcommand.cpp | 7 +- unifracweightedcommand.cpp | 9 +- 60 files changed, 821 insertions(+), 263 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c7c7cee..99454a9 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -2316,8 +2316,8 @@ GCC_OPTIMIZATION_LEVEL = 3; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../release\\\"\"", - "VERSION=\"\\\"1.22.0\\\"\"", - "RELEASE_DATE=\"\\\"10/12/2011\\\"\"", + "VERSION=\"\\\"1.23.0\\\"\"", + "RELEASE_DATE=\"\\\"1/9/2012\\\"\"", ); "GCC_VERSION[arch=*]" = ""; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; diff --git a/bayesian.cpp b/bayesian.cpp index b1f2c4c..f7ea6e4 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -12,12 +12,13 @@ #include "phylosummary.h" #include "referencedb.h" /**************************************************************************************************/ -Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid) : -Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { +Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid, bool f) : +Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { ReferenceDB* rdb = ReferenceDB::getInstance(); threadID = tid; + flip = f; string baseName = tempFile; if (baseName == "saved") { baseName = rdb->getSavedReference(); } @@ -78,13 +79,14 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (tfile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory... "); cout.flush();; wordGenusProb = rdb->wordGenusProb; + WordPairDiffArr = rdb->WordPairDiffArr; }else { m->mothurOut("Reading template probabilities... "); cout.flush(); readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); } //save probabilities - if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } }else{ //create search database and names vector @@ -108,11 +110,12 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //initialze probabilities wordGenusProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //cout << numKmers << '\t' << genusNodes.size() << endl; for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } //cout << numKmers << '\t' << genusNodes.size() << endl; - //ofstream out; - //ofstream out2; + ofstream out; + ofstream out2; #ifdef USE_MPI int pid; @@ -122,17 +125,17 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { #endif - //m->openOutputFile(probFileName, out); + m->openOutputFile(probFileName, out); //output mothur version - //out << "#" << m->getVersion() << endl; + out << "#" << m->getVersion() << endl; - //out << numKmers << endl; + out << numKmers << endl; - //m->openOutputFile(probFileName2, out2); + m->openOutputFile(probFileName2, out2); //output mothur version - //out2 << "#" << m->getVersion() << endl; + out2 << "#" << m->getVersion() << endl; #ifdef USE_MPI } @@ -149,7 +152,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << i << '\t'; + out << i << '\t'; #ifdef USE_MPI } @@ -168,7 +171,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //probabilityInTemplate = (# of seqs with that word in template + 0.50) / (total number of seqs in template + 1); float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1); - + diffPair tempProb(log(probabilityInTemplate), 0.0); + WordPairDiffArr[i] = tempProb; + int numNotZero = 0; for (int k = 0; k < genusNodes.size(); k++) { //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); @@ -184,7 +189,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << k << '\t' << wordGenusProb[i][k] << '\t'; + out << k << '\t' << wordGenusProb[i][k] << '\t' ; #ifdef USE_MPI } @@ -200,8 +205,8 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out << endl; - //out2 << probabilityInTemplate << '\t' << numNotZero << endl; + out << endl; + out2 << probabilityInTemplate << '\t' << numNotZero << '\t' << log(probabilityInTemplate) << endl; #ifdef USE_MPI } @@ -214,8 +219,8 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - //out.close(); - //out2.close(); + out.close(); + out2.close(); #ifdef USE_MPI } @@ -228,10 +233,15 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); //save probabilities - if (rdb->save) { rdb->wordGenusProb = wordGenusProb; } + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } } } - + + generateWordPairDiffArr(); + + //save probabilities + if (rdb->save) { rdb->wordGenusProb = wordGenusProb; rdb->WordPairDiffArr = WordPairDiffArr; } + m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine(); } @@ -258,13 +268,13 @@ string Bayesian::getTaxonomy(Sequence* seq) { try { string tax = ""; Kmer kmer(kmerSize); + flipped = false; //get words contained in query //getKmerString returns a string where the index in the string is hte kmer number //and the character at that index can be converted to be the number of times that kmer was seen - string queryKmerString = kmer.getKmerString(seq->getUnaligned()); - + vector queryKmers; for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it if (queryKmerString[i] != '!') { //this kmer is in the query @@ -272,7 +282,22 @@ string Bayesian::getTaxonomy(Sequence* seq) { } } - if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; } + //if user wants to test reverse compliment and its reversed use that instead + if (flip) { + if (isReversed(queryKmers)) { + flipped = true; + seq->reverseComplement(); + queryKmerString = kmer.getKmerString(seq->getUnaligned()); + queryKmers.clear(); + for (int i = 0; i < queryKmerString.length()-1; i++) { // the -1 is to ignore any kmer with an N in it + if (queryKmerString[i] != '!') { //this kmer is in the query + queryKmers.push_back(i); + } + } + } + } + + if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); simpleTax = "unknown;"; return "unknown;"; } int index = getMostProbableTaxonomy(queryKmers); @@ -283,7 +308,7 @@ string Bayesian::getTaxonomy(Sequence* seq) { int numToSelect = queryKmers.size() / 8; tax = bootstrapResults(queryKmers, index, numToSelect); - + return tax; } catch(exception& e) { @@ -366,7 +391,8 @@ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { seqTax = phyloTree->get(seqTax.parent); } - if (confidenceTax == "") { confidenceTax = "unclassified;"; simpleTax = "unclassified;"; } + if (confidenceTax == "") { confidenceTax = "unknown;"; simpleTax = "unknown;"; } + return confidenceTax; } @@ -412,6 +438,46 @@ int Bayesian::getMostProbableTaxonomy(vector queryKmer) { exit(1); } } +//******************************************************************************************************************** +//if it is more probable that the reverse compliment kmers are in the template, then we assume the sequence is reversed. +bool Bayesian::isReversed(vector& queryKmers){ + try{ + bool reversed = false; + float prob = 0; + float reverseProb = 0; + + for (int i = 0; i < queryKmers.size(); i++){ + int kmer = queryKmers[i]; + if (kmer >= 0){ + prob += WordPairDiffArr[kmer].prob; + reverseProb += WordPairDiffArr[kmer].reverseProb; + } + } + + if (reverseProb > prob){ reversed = true; } + + return reversed; + } + catch(exception& e) { + m->errorOut(e, "Bayesian", "isReversed"); + exit(1); + } +} +//******************************************************************************************************************** +int Bayesian::generateWordPairDiffArr(){ + try{ + Kmer kmer(kmerSize); + for (int i = 0; i < WordPairDiffArr.size(); i++) { + int reversedWord = kmer.getReverseKmerNumber(i); + WordPairDiffArr[i].reverseProb = WordPairDiffArr[reversedWord].prob; + } + + return 0; + }catch(exception& e) { + m->errorOut(e, "Bayesian", "generateWordPairDiffArr"); + exit(1); + } +} /************************************************************************************************* map Bayesian::parseTaxMap(string newTax) { try{ @@ -515,7 +581,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string int kmer, name; vector numbers; numbers.resize(numKmers); float prob; - vector zeroCountProb; zeroCountProb.resize(numKmers); + vector zeroCountProb; zeroCountProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //read version length = positions[1] - positions[0]; @@ -537,7 +604,10 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string delete buf4; istringstream iss (tempBuf,istringstream::in); - iss >> zeroCountProb[i] >> numbers[i]; + float probTemp; + iss >> zeroCountProb[i] >> numbers[i] >> probTemp; + WordPairDiffArr[i].prob = tempProb; + } MPI_File_close(&inMPI); @@ -585,13 +655,16 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string int kmer, name, count; count = 0; vector num; num.resize(numKmers); float prob; - vector zeroCountProb; zeroCountProb.resize(numKmers); + vector zeroCountProb; zeroCountProb.resize(numKmers); + WordPairDiffArr.resize(numKmers); //read version string line2 = m->getline(inNum); m->gobble(inNum); + float probTemp; //cout << threadID << '\t' << line2 << '\t' << this << endl; while (inNum) { - inNum >> zeroCountProb[count] >> num[count]; + inNum >> zeroCountProb[count] >> num[count] >> probTemp; + WordPairDiffArr[count].prob = probTemp; count++; m->gobble(inNum); //cout << threadID << '\t' << count << endl; diff --git a/bayesian.h b/bayesian.h index 1cf5145..7c88433 100644 --- a/bayesian.h +++ b/bayesian.h @@ -18,7 +18,7 @@ class Bayesian : public Classify { public: - Bayesian(string, string, string, int, int, int, int); + Bayesian(string, string, string, int, int, int, int, bool); ~Bayesian(); string getTaxonomy(Sequence*); @@ -30,12 +30,17 @@ private: vector genusTotals; vector genusNodes; //indexes in phyloTree where genus' are located + vector WordPairDiffArr; + int kmerSize, numKmers, confidenceThreshold, iters; string bootstrapResults(vector, int, int); int getMostProbableTaxonomy(vector); void readProbFile(ifstream&, ifstream&, string, string); bool checkReleaseDate(ifstream&, ifstream&, ifstream&, ifstream&); + bool isReversed(vector&); + vector createWordIndexArr(Sequence*); + int generateWordPairDiffArr(); }; diff --git a/binsequencecommand.cpp b/binsequencecommand.cpp index 4977cad..2117daf 100644 --- a/binsequencecommand.cpp +++ b/binsequencecommand.cpp @@ -167,7 +167,7 @@ BinSeqCommand::BinSeqCommand(string option) { } namesfile = validParameter.validFile(parameters, "name", true); - if (namesfile == "not open") { abort = true; } + if (namesfile == "not open") { namesfile = ""; abort = true; } else if (namesfile == "not found") { namesfile = ""; } else { m->setNameFile(namesfile); } @@ -176,6 +176,11 @@ BinSeqCommand::BinSeqCommand(string option) { else if (groupfile == "not found") { groupfile = ""; } else { m->setGroupFile(groupfile); } + if (namesfile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/classify.cpp b/classify.cpp index c92d9fd..3bf0d57 100644 --- a/classify.cpp +++ b/classify.cpp @@ -61,7 +61,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me names.push_back(temp.getName()); database->addSequence(temp); } -// database->generateDB(); + database->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); database->readKmerDB(kmerFileTest); @@ -150,7 +150,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } } -// database->generateDB(); + database->generateDB(); MPI_File_close(&inMPI); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else @@ -200,7 +200,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } fastaFile.close(); -// database->generateDB(); + database->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); @@ -223,9 +223,9 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me database->setNumSeqs(names.size()); //sanity check - //bool okay = phyloTree->ErrorCheck(names); + bool okay = phyloTree->ErrorCheck(names); - //if (!okay) { m->control_pressed = true; } + if (!okay) { m->control_pressed = true; } m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); @@ -238,7 +238,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } } /**************************************************************************************************/ -Classify::Classify() { m = MothurOut::getInstance(); database = NULL; } +Classify::Classify() { m = MothurOut::getInstance(); database = NULL; flipped=false; } /**************************************************************************************************/ int Classify::readTaxonomy(string file) { diff --git a/classify.h b/classify.h index 2e209f5..4e03547 100644 --- a/classify.h +++ b/classify.h @@ -30,6 +30,7 @@ public: virtual ~Classify(){}; virtual string getTaxonomy(Sequence*) = 0; virtual string getSimpleTax() { return simpleTax; } + virtual bool getFlipped() { return flipped; } virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float); virtual void setDistName(string s) {} //for knn, so if distance method is selected with knn you can create the smallest distance file in the right place. @@ -45,6 +46,7 @@ protected: string taxFile, templateFile, simpleTax; vector names; int threadID; + bool flip, flipped; int readTaxonomy(string); vector parseTax(string); diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index e28961b..c889637 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -182,7 +182,7 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { else if (refTaxonomy == "not open") { abort = true; } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -214,6 +214,11 @@ ClassifyOtuCommand::ClassifyOtuCommand(string option) { if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } + if (namefile == ""){ + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 9ee7b5e..328cd58 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -27,6 +27,7 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); + //CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); @@ -69,6 +70,7 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n"; @@ -89,6 +91,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ setParameters(); vector tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; } @@ -126,6 +129,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { outputTypes["taxonomy"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -440,15 +444,24 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = m->isTrue(temp); + //temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "T"; } + //flip = m->isTrue(temp); + flip = true; + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine(); search = "kmer"; } + + if (namefileNames.size() == 0){ + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } + } } @@ -470,12 +483,12 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); } + if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } else { m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); m->mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip); } if (m->control_pressed) { delete classify; return 0; } @@ -494,6 +507,7 @@ int ClassifySeqsCommand::execute(){ if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy"; + string newaccnosFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "flip.accnos"; string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary"; @@ -503,6 +517,7 @@ int ClassifySeqsCommand::execute(){ } outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); + outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); int start = time(NULL); @@ -521,6 +536,7 @@ int ClassifySeqsCommand::execute(){ MPI_File inMPI; MPI_File outMPINewTax; MPI_File outMPITempTax; + MPI_File outMPIAcc; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -530,6 +546,9 @@ int ClassifySeqsCommand::execute(){ char outTempTax[1024]; strcpy(outTempTax, tempTaxonomyFile.c_str()); + + char outAcc[1024]; + strcpy(outAcc, newaccnosFile.c_str()); char inFileName[1024]; strcpy(inFileName, fastaFileNames[s].c_str()); @@ -537,8 +556,9 @@ int ClassifySeqsCommand::execute(){ MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); + MPI_File_open(MPI_COMM_WORLD, outAcc, outMode, MPI_INFO_NULL, &outMPIAcc); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } if (pid == 0) { //you are the root process @@ -557,9 +577,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } for (int i = 1; i < processors; i++) { int done; @@ -577,9 +597,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } int done = 0; MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); @@ -589,6 +609,7 @@ int ClassifySeqsCommand::execute(){ MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); + MPI_File_close(&outMPIAcc); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else @@ -613,16 +634,19 @@ int ClassifySeqsCommand::execute(){ } #endif if(processors == 1){ - numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); }else{ - numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); } #endif + + if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); - + + #ifdef USE_MPI if (pid == 0) { //this part does not need to be paralellized @@ -744,6 +768,12 @@ int ClassifySeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + current = ""; + itTypes = outputTypes.find("accnos"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } + } + delete classify; return 0; @@ -785,7 +815,7 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { /**************************************************************************************************/ -int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { +int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string accnos, string filename) { try { int num = 0; @@ -802,7 +832,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", filename); //pass numSeqs to parent ofstream out; @@ -820,7 +850,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, } //parent does its part - num = driver(lines[0], taxFileName, tempTaxFile, filename); + num = driver(lines[0], taxFileName, tempTaxFile, accnos, filename); //force parent to wait until all the processes are done for (int i=0;istart, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i); + classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flipThreshold); pDataArray.push_back(tempclass); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. @@ -861,7 +891,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, } //parent does its part - num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", filename); + num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", accnos + toString(processors-1) + ".temp", filename); processIDS.push_back((processors-1)); //Wait until all threads have terminated. @@ -879,8 +909,10 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, for(int i=0;imothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp")); m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp")); + m->mothurRemove((m->getFullPathName(accnos) + toString(processIDS[i]) + ".temp")); } return num; @@ -917,13 +949,16 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){ +int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string accnos, string filename){ try { ofstream outTax; m->openOutputFile(taxFName, outTax); ofstream outTaxSimple; m->openOutputFile(tempTFName, outTaxSimple); + + ofstream outAcc; + m->openOutputFile(accnos, outAcc); ifstream inFASTA; m->openInputFile(filename, inFASTA); @@ -936,7 +971,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT int count = 0; while (!done) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { + inFASTA.close(); + outTax.close(); + outTaxSimple.close(); + outAcc.close(); return 0; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); @@ -945,17 +984,20 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT taxonomy = classify->getTaxonomy(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 0; } - - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - }else{ - outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; - } - - outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } + + //output confidence scores or not + if (probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } + + if (classify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + + outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + count++; } delete candidateSeq; @@ -977,6 +1019,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT inFASTA.close(); outTax.close(); outTaxSimple.close(); + outAcc.close(); return count; } @@ -987,10 +1030,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT } //********************************************************************************************************************** #ifdef USE_MPI -int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, MPI_File& accFile, vector& MPIPos){ try { MPI_Status statusNew; MPI_Status statusTemp; + MPI_Status statusAcc; MPI_Status status; int pid; @@ -1018,29 +1062,40 @@ int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; - }else{ - outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - } - - int length = outputString.length(); - char* buf2 = new char[length]; - memcpy(buf2, outputString.c_str(), length); + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } - MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); - delete buf2; - + //output confidence scores or not + if (probs) { + outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; + }else{ outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - length = outputString.length(); - char* buf = new char[length]; - memcpy(buf, outputString.c_str(), length); + } + + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); + delete buf2; + + outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; + length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); - MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); - delete buf; + MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); + delete buf; + + if (classify->getFlipped()) { + outputString = candidateSeq->getName() + "\n"; + length = outputString.length(); + char* buf3 = new char[length]; + memcpy(buf3, outputString.c_str(), length); + + MPI_File_write_shared(accFile, buf3, length, MPI_CHAR, &statusAcc); + delete buf3; } + } delete candidateSeq; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 0e21a20..0bf4a91 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -72,16 +72,16 @@ private: string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; - bool abort, probs, save; + bool abort, probs, save, flip; - int driver(linePair*, string, string, string); + int driver(linePair*, string, string, string, string); void appendTaxFiles(string, string); - int createProcesses(string, string, string); + int createProcesses(string, string, string, string); string addUnclassifieds(string, int); int MPIReadNamesFile(string); #ifdef USE_MPI - int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector&); + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); #endif }; @@ -93,16 +93,17 @@ struct classifyData { string taxFName; string tempTFName; string filename; - string search, taxonomyFileName, templateFileName, method; + string search, taxonomyFileName, templateFileName, method, accnos; unsigned long long start; unsigned long long end; MothurOut* m; float match, misMatch, gapOpen, gapExtend; int count, kmerSize, threadID, cutoff, iters, numWanted; - bool probs; + bool probs, flip; classifyData(){} - classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) { + classifyData(string acc, bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid, bool fli) { + accnos = acc; taxonomyFileName = tx; templateFileName = te; taxFName = a; @@ -124,6 +125,7 @@ struct classifyData { threadID = tid; probs = p; count = 0; + flip = fli; } }; @@ -141,6 +143,9 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ ofstream outTaxSimple; pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple); + ofstream outAcc; + pDataArray->m->openOutputFile(pDataArray->accnos, outAcc); + ifstream inFASTA; pDataArray->m->openInputFile(pDataArray->filename, inFASTA); @@ -157,12 +162,12 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; - if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); } - else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } + if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } + else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID, pDataArray->flipThreshold); } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); - myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID); + myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } if (pDataArray->m->control_pressed) { delete myclassify; return 0; } @@ -180,16 +185,19 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; } - if (taxonomy != "bad seq") { - //output confidence scores or not - if (pDataArray->probs) { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - }else{ - outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; - } - - outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + if (taxonomy == "unknown;") { pDataArray->m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); pDataArray->m->mothurOutEndLine(); } + + //output confidence scores or not + if (pDataArray->probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; } + + outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl; + + if (myclassify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + count++; } delete candidateSeq; diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index ad1bd81..772113b 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -147,7 +147,7 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not found") { namefile = ""; } - else if (namefile == "not open") { abort = true; } + else if (namefile == "not open") { namefile = ""; abort = true; } else { readNameFile(); m->setNameFile(namefile); } string temp; @@ -157,6 +157,11 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { temp = validParameter.validFile(parameters, "percent", false); if (temp == "not found"){ temp = "0"; } m->mothurConvert(temp, percent); + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 787d84a..34caf65 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -201,7 +201,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); } taxFile = validParameter.validFile(parameters, "taxonomy", true); - if (taxFile == "not open") { abort = true; } + if (taxFile == "not open") { taxFile = ""; abort = true; } else if (taxFile == "not found") { taxFile = ""; } else { m->setTaxonomyFile(taxFile); } diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index 4f8c53b..223e5db 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -142,7 +142,7 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { }else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -163,7 +163,11 @@ ConsensusSeqsCommand::ConsensusSeqsCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } - + + if (namefile == ""){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/database.hpp b/database.hpp index 580c3f2..b2817a7 100644 --- a/database.hpp +++ b/database.hpp @@ -58,6 +58,7 @@ public: virtual void readKmerDB(ifstream&){}; virtual void setNumSeqs(int i) { numSeqs = i; } virtual vector getSequencesWithKmer(int){ vector filler; return filler; }; + virtual int getReversed(int) { return 0; } virtual int getMaxKmer(){ return 1; } protected: diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index 3fa622f..3541e00 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -125,9 +125,15 @@ DeconvoluteCommand::DeconvoluteCommand(string option) { } oldNameMapFName = validParameter.validFile(parameters, "name", true); - if (oldNameMapFName == "not open") { abort = true; } + if (oldNameMapFName == "not open") { oldNameMapFName = ""; abort = true; } else if (oldNameMapFName == "not found"){ oldNameMapFName = ""; } else { m->setNameFile(oldNameMapFName); } + + if (oldNameMapFName == "") { + vector files; files.push_back(inFastaName); + parser.getNameFile(files); + } + } } diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp index 0c42b7d..432894b 100644 --- a/getgroupscommand.cpp +++ b/getgroupscommand.cpp @@ -16,12 +16,12 @@ //********************************************************************************************************************** vector GetGroupsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pgroup); - CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -180,12 +180,12 @@ GetGroupsCommand::GetGroupsCommand(string option) { else { m->setAccnosFile(accnosfile); } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -200,7 +200,7 @@ GetGroupsCommand::GetGroupsCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } @@ -253,6 +253,10 @@ GetGroupsCommand::GetGroupsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared or list."); m->mothurOutEndLine(); abort = true; } if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index b4e41e5..c2be580 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -168,12 +168,12 @@ GetLineageCommand::GetLineageCommand(string option) { //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -192,7 +192,7 @@ GetLineageCommand::GetLineageCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = m->getTaxonomyFile(); if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } @@ -217,6 +217,11 @@ GetLineageCommand::GetLineageCommand(string option) { m->splitAtChar(taxons, listOfTaxons, '-'); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/getseqscommand.cpp b/getseqscommand.cpp index ffe49b4..9bf188d 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -206,12 +206,12 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (accnosfile2 == "not found") { accnosfile2 = ""; } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -230,7 +230,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } @@ -249,6 +249,11 @@ GetSeqsCommand::GetSeqsCommand(string option) { dups = m->isTrue(temp); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } } } diff --git a/groupmap.h b/groupmap.h index 99891a4..29c1741 100644 --- a/groupmap.h +++ b/groupmap.h @@ -2,7 +2,7 @@ #define GROUPMAP_H /* * groupmap.h - * Dotur + * Mothur * * Created by Sarah Westcott on 12/1/08. * Copyright 2008 Schloss Lab UMASS Amherst. All rights reserved. diff --git a/kmer.cpp b/kmer.cpp index 6ee84d3..50574f4 100644 --- a/kmer.cpp +++ b/kmer.cpp @@ -131,6 +131,27 @@ string Kmer::getKmerBases(int kmerNumber){ } return kmer; } +/**************************************************************************************************/ + +int Kmer::getReverseKmerNumber(int kmerNumber){ + + string kmerString = getKmerBases(kmerNumber); + + //get Reverse + string reverse = ""; + for(int i=kmerString.length()-1;i>=0;i--){ + if(kmerString[i] == 'A') { reverse += 'T'; } + else if(kmerString[i] == 'T'){ reverse += 'A'; } + else if(kmerString[i] == 'G'){ reverse += 'C'; } + else if(kmerString[i] == 'C'){ reverse += 'G'; } + else { reverse += 'N'; } + } + + int reverseNumber = getKmerNumber(reverse, 0); + + return reverseNumber; + +} /**************************************************************************************************/ diff --git a/kmer.hpp b/kmer.hpp index d5b67d6..9a3c5a0 100644 --- a/kmer.hpp +++ b/kmer.hpp @@ -21,6 +21,7 @@ public: string getKmerString(string); int getKmerNumber(string, int); string getKmerBases(int); + int getReverseKmerNumber(int); vector< map > getKmerCounts(string sequence); //for use in chimeraCheck private: diff --git a/kmerdb.cpp b/kmerdb.cpp index 2703e16..9a5d235 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -57,6 +57,8 @@ KmerDB::~KmerDB(){} vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ try { + if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; } + vector topMatches; Kmer kmer(kmerSize); searchScore = 0; @@ -215,6 +217,20 @@ int KmerDB::getCount(int kmer) { } } /**************************************************************************************************/ +int KmerDB::getReversed(int kmerNumber) { + try { + Kmer kmer(kmerSize); + + if (kmerNumber < 0) { return 0; } //if user gives negative number + else if (kmerNumber > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer + else { return kmer.getReverseKmerNumber(kmerNumber); } // kmer is in vector range + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "getReversed"); + exit(1); + } +} +/**************************************************************************************************/ vector KmerDB::getSequencesWithKmer(int kmer) { try { diff --git a/kmerdb.hpp b/kmerdb.hpp index 62d4836..4ae00b9 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -36,6 +36,7 @@ public: void readKmerDB(ifstream&); int getCount(int); //returns number of sequences with that kmer number vector getSequencesWithKmer(int); //returns vector of sequences that contain kmer passed in + int getReversed(int); //returns reverse compliment kmerNumber int getMaxKmer() { return maxKmer; } private: diff --git a/knn.cpp b/knn.cpp index 6053b6e..837fa6d 100644 --- a/knn.cpp +++ b/knn.cpp @@ -72,11 +72,11 @@ string Knn::getTaxonomy(Sequence* seq) { } if (closestNames.size() == 0) { - m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); m->mothurOutEndLine(); - tax = "bad seq"; + m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine(); + tax = "unknown;"; }else{ tax = findCommonTaxonomy(closestNames); - if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); m->mothurOutEndLine(); tax = "bad seq"; } + if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; } } simpleTax = tax; @@ -90,7 +90,7 @@ string Knn::getTaxonomy(Sequence* seq) { /**************************************************************************************************/ string Knn::findCommonTaxonomy(vector closest) { try { - vector< vector > taxons; //taxon[0] = vector of taxonomy info for closest[0]. + /*vector< vector > taxons; //taxon[0] = vector of taxonomy info for closest[0]. //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea; //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria.... @@ -101,6 +101,7 @@ string Knn::findCommonTaxonomy(vector closest) { if (m->control_pressed) { return "control"; } string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy + cout << tax << endl; taxons[i] = parseTax(tax); @@ -128,9 +129,54 @@ string Knn::findCommonTaxonomy(vector closest) { } break; } + }*/ + + string conTax; + + //create a tree containing sequences from this bin + PhyloTree* p = new PhyloTree(); + + for (int i = 0; i < closest.size(); i++) { + p->addSeqToTree(closest[i], taxonomy[closest[i]]); } - - return common; + + //build tree + p->assignHeirarchyIDs(0); + + TaxNode currentNode = p->get(0); + + //at each level + while (currentNode.children.size() != 0) { //you still have more to explore + + TaxNode bestChild; + int bestChildSize = 0; + + //go through children + for (map::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) { + + TaxNode temp = p->get(itChild->second); + + //select child with largest accessions - most seqs assigned to it + if (temp.accessions.size() > bestChildSize) { + bestChild = p->get(itChild->second); + bestChildSize = temp.accessions.size(); + } + + } + + if (bestChildSize == closest.size()) { //if yes, add it + conTax += bestChild.name + ";"; + }else{ //if no, quit + break; + } + + //move down a level + currentNode = bestChild; + } + + delete p; + + return conTax; } catch(exception& e) { m->errorOut(e, "Knn", "findCommonTaxonomy"); diff --git a/makefile b/makefile index e2e66c1..476070e 100644 --- a/makefile +++ b/makefile @@ -32,6 +32,7 @@ ifeq ($(strip $(64BIT_VERSION)),yes) #if you using cygwin to build Windows the following line #CXX = x86_64-w64-mingw32-g++ #CC = x86_64-w64-mingw32-g++ + #FORTAN_COMPILER = x86_64-w64-mingw32-gfortran #TARGET_ARCH += -m64 -static #if you are a linux user use the following line diff --git a/memchi2.cpp b/memchi2.cpp index 595b54f..f08b4b8 100644 --- a/memchi2.cpp +++ b/memchi2.cpp @@ -17,7 +17,7 @@ EstOutput MemChi2::getValues(vector shared) { int nonZeroA = 0; int nonZeroB = 0; int totalOtus = shared[0]->getNumBins(); - int totalGroups = shared.size(); + //int totalGroups = shared.size(); //for each otu for (int i = 0; i < shared[0]->getNumBins(); i++) { diff --git a/mergegroupscommand.cpp b/mergegroupscommand.cpp index af2af96..e30558c 100644 --- a/mergegroupscommand.cpp +++ b/mergegroupscommand.cpp @@ -13,7 +13,8 @@ //********************************************************************************************************************** vector MergeGroupsCommand::setParameters(){ try { - CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pgroup); CommandParameter pdesign("design", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pdesign); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); @@ -33,12 +34,12 @@ vector MergeGroupsCommand::setParameters(){ string MergeGroupsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The merge.groups command reads a shared file and a design file and merges the groups in the shared file that are in the same grouping in the design file.\n"; + helpString += "The merge.groups command reads a shared or group file and a design file and merges the groups that are in the same grouping in the design file.\n"; helpString += "The merge.groups command outputs a .shared file. \n"; - helpString += "The merge.groups command parameters are shared, groups, label and design. The design and shared parameter are required.\n"; + helpString += "The merge.groups command parameters are shared, group, groups, label and design. The design parameter is required.\n"; helpString += "The design parameter allows you to assign your groups to sets. It is required. \n"; helpString += "The design file looks like the group file. It is a 2 column tab delimited file, where the first column is the group name and the second column is the set the group belongs to.\n"; - helpString += "The groups parameter allows you to specify which of the groups in your shared you would like included. The group names are separated by dashes.\n"; + helpString += "The groups parameter allows you to specify which of the groups in your shared or group file you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; helpString += "The merge.groups command should be in the following format: merge.groups(design=yourDesignFile, shared=yourSharedFile).\n"; helpString += "Example merge.groups(design=temp.design, groups=A-B-C, shared=temp.shared).\n"; @@ -58,6 +59,7 @@ MergeGroupsCommand::MergeGroupsCommand(){ setParameters(); vector tempOutNames; outputTypes["shared"] = tempOutNames; + outputTypes["group"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "MergeGroupsCommand", "MetaStatsCommand"); @@ -92,6 +94,7 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["shared"] = tempOutNames; + outputTypes["group"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -116,6 +119,15 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + } //check for required parameters @@ -128,15 +140,15 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { else { m->mothurOut("You have no current designfile and the design parameter is required."); m->mothurOutEndLine(); abort = true; } }else { m->setDesignFile(designfile); } - //make sure the user has already run the read.otu command sharedfile = validParameter.validFile(parameters, "shared", true); - if (sharedfile == "not open") { abort = true; } - else if (sharedfile == "not found") { - //if there is a current shared file, use it - sharedfile = m->getSharedFile(); - if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; } - }else { m->setSharedFile(sharedfile); } + if (sharedfile == "not open") { abort = true; sharedfile = ""; } + else if (sharedfile == "not found") { sharedfile = ""; } + else { m->setSharedFile(sharedfile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; groupfile = ""; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -151,6 +163,19 @@ MergeGroupsCommand::MergeGroupsCommand(string option) { if (groups == "not found") { groups = "all"; } m->splitAtDash(groups, Groups); m->setGroups(Groups); + + if ((sharedfile == "") && (groupfile == "")) { + //give priority to group, then shared + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current groupfile or sharedfile and one is required."); m->mothurOutEndLine(); abort = true; + } + } + } } } @@ -165,17 +190,106 @@ int MergeGroupsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } + + designMap = new GroupMap(designfile); + designMap->readDesignMap(); + + if (groupfile != "") { processGroupFile(designMap); } + if (sharedfile != "") { processSharedFile(designMap); } + + //reset groups parameter + m->clearGroups(); + delete designMap; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;} + + + //set shared file as new current sharedfile + string current = ""; + itTypes = outputTypes.find("shared"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } + } + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MergeGroupsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int MergeGroupsCommand::process(vector& thisLookUp, ofstream& out){ + try { + + map merged; + map::iterator it; + + for (int i = 0; i < thisLookUp.size(); i++) { + + if (m->control_pressed) { return 0; } + + //what grouping does this group belong to + string grouping = designMap->getGroup(thisLookUp[i]->getGroup()); + if (grouping == "not found") { m->mothurOut("[ERROR]: " + thisLookUp[i]->getGroup() + " is not in your design file. Ignoring!"); m->mothurOutEndLine(); grouping = "NOTFOUND"; } + + else { + //do we already have a member of this grouping? + it = merged.find(grouping); + + if (it == merged.end()) { //nope, so create it + merged[grouping] = *thisLookUp[i]; + merged[grouping].setGroup(grouping); + }else { //yes, merge it + + for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { + int abund = (it->second).getAbundance(j); + abund += thisLookUp[i]->getAbundance(j); + + (it->second).set(j, abund, grouping); + } + } + } + } + + //print new file + for (it = merged.begin(); it != merged.end(); it++) { + out << (it->second).getLabel() << '\t' << it->first << '\t'; + (it->second).print(out); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "MergeGroupsCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** + +int MergeGroupsCommand::processSharedFile(GroupMap*& designMap){ + try { - if (outputDir == "") { outputDir += m->hasPath(sharedfile); } - string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "merge" + m->getExtension(sharedfile); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + "merge" + m->getExtension(sharedfile); outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName); ofstream out; m->openOutputFile(outputFileName, out); - designMap = new GroupMap(designfile); - designMap->readDesignMap(); - InputData input(sharedfile, "sharedfile"); lookup = input.getSharedRAbundVectors(); string lastLabel = lookup[0]->getLabel(); @@ -256,78 +370,68 @@ int MergeGroupsCommand::execute(){ } out.close(); - //reset groups parameter - m->clearGroups(); - delete designMap; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0;} - - - //set shared file as new current sharedfile - string current = ""; - itTypes = outputTypes.find("shared"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSharedFile(current); } - } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); + return 0; + } catch(exception& e) { - m->errorOut(e, "MergeGroupsCommand", "execute"); + m->errorOut(e, "MergeGroupsCommand", "processSharedFile"); exit(1); } } //********************************************************************************************************************** -int MergeGroupsCommand::process(vector& thisLookUp, ofstream& out){ +int MergeGroupsCommand::processGroupFile(GroupMap*& designMap){ try { - map merged; - map::iterator it; + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "merge" + m->getExtension(groupfile); + outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName); - for (int i = 0; i < thisLookUp.size(); i++) { + ofstream out; + m->openOutputFile(outputFileName, out); + + //read groupfile + GroupMap groupMap(groupfile); + groupMap.readMap(); + + //fill Groups - checks for "all" and for any typo groups + SharedUtil* util = new SharedUtil(); + vector nameGroups = groupMap.getNamesOfGroups(); + util->setGroups(Groups, nameGroups); + delete util; + + vector namesOfSeqs = groupMap.getNamesSeqs(); + bool error = false; + + for (int i = 0; i < namesOfSeqs.size(); i++) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { break; } - //what grouping does this group belong to - string grouping = designMap->getGroup(thisLookUp[i]->getGroup()); - if (grouping == "not found") { m->mothurOut("[ERROR]: " + thisLookUp[i]->getGroup() + " is not in your design file. Ignoring!"); m->mothurOutEndLine(); grouping = "NOTFOUND"; } + string thisGroup = groupMap.getGroup(namesOfSeqs[i]); - else { - //do we already have a member of this grouping? - it = merged.find(grouping); + //are you in a group the user wants + if (m->inUsersGroups(thisGroup, Groups)) { + string thisGrouping = designMap->getGroup(thisGroup); - if (it == merged.end()) { //nope, so create it - merged[grouping] = *thisLookUp[i]; - merged[grouping].setGroup(grouping); - }else { //yes, merge it - - for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) { - int abund = (it->second).getAbundance(j); - abund += thisLookUp[i]->getAbundance(j); - - (it->second).set(j, abund, grouping); - } + if (thisGrouping == "not found") { m->mothurOut("[ERROR]: " + namesOfSeqs[i] + " is from group " + thisGroup + " which is not in your design file, please correct."); m->mothurOutEndLine(); error = true; } + else { + out << namesOfSeqs[i] << '\t' << thisGrouping << endl; } } } - //print new file - for (it = merged.begin(); it != merged.end(); it++) { - out << (it->second).getLabel() << '\t' << it->first << '\t'; - (it->second).print(out); - } + if (error) { m->control_pressed = true; } + + out.close(); return 0; } catch(exception& e) { - m->errorOut(e, "MergeGroupsCommand", "process"); + m->errorOut(e, "MergeGroupsCommand", "processGroupFile"); exit(1); } } diff --git a/mergegroupscommand.h b/mergegroupscommand.h index 7907dfe..7f216eb 100644 --- a/mergegroupscommand.h +++ b/mergegroupscommand.h @@ -38,10 +38,12 @@ private: bool abort, allLines, pickedGroups; set labels; //holds labels to be used - string groups, label, outputDir, inputDir, designfile, sharedfile; + string groups, label, outputDir, inputDir, designfile, sharedfile, groupfile; vector Groups, outputNames; int process(vector&, ofstream&); + int processSharedFile(GroupMap*&); + int processGroupFile(GroupMap*&); }; #endif diff --git a/mothur.h b/mothur.h index 3b7c459..50344e2 100644 --- a/mothur.h +++ b/mothur.h @@ -106,6 +106,19 @@ struct ThreadNode { IntNode* right; }; +struct diffPair { + float prob; + float reverseProb; + + diffPair() { + prob = 0; reverseProb = 0; + } + diffPair(float p, float rp) { + prob = p; + reverseProb = rp; + } +}; + /************************************************************/ struct clusterNode { int numSeq; diff --git a/mothurout.cpp b/mothurout.cpp index 9dcdc80..20a7b52 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2152,7 +2152,7 @@ int MothurOut::removeConfidences(string& tax) { int pos2 = taxon.find_last_of(')'); if (pos2 != -1) { string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1))); - if (isContainingOnlyDigits(confidenceScore)) { + if (isNumeric1(confidenceScore)) { taxon = taxon.substr(0, pos); //rip off confidence } } diff --git a/optionparser.cpp b/optionparser.cpp index 71714f1..1e8c445 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -104,7 +104,7 @@ map OptionParser::getParameters() { } }else{ it++; } } - + return parameters; } catch(exception& e) { @@ -113,4 +113,54 @@ map OptionParser::getParameters() { } } +/***********************************************************************/ +//pass a vector of filenames that may match the current namefile. +//this function will look at each one, if the rootnames match, mothur will warn +//the user that they may have neglected to provide a namefile. +//stops when it finds a match. +bool OptionParser::getNameFile(vector files) { + try { + string namefile = m->getNameFile(); + bool match = false; + + if (namefile != "") { + string temp = m->getRootName(m->getSimpleName(namefile)); + vector rootName; + m->splitAtChar(temp, rootName, '.'); + + for (int i = 0; i < files.size(); i++) { + temp = m->getRootName(m->getSimpleName(files[i])); + vector root; + m->splitAtChar(temp, root, '.'); + + int smallest = rootName.size(); + if (root.size() < smallest) { smallest = root.size(); } + + int numMatches = 0; + for(int j = 0; j < smallest; j++) { + if (root[j] == rootName[j]) { numMatches++; } + } + + if (smallest > 0) { + if ((numMatches >= (smallest-2)) && (root[0] == rootName[0])) { + m->mothurOut("[WARNING]: This command can take a namefile and you did not provide one. The current namefile is " + namefile + " which seems to match " + files[i] + "."); + m->mothurOutEndLine(); + match = true; + break; + } + } + } + + } + + + return match; + } + catch(exception& e) { + m->errorOut(e, "OptionParser", "getNameFile"); + exit(1); + } +} + + /***********************************************************************/ diff --git a/optionparser.h b/optionparser.h index 7707e73..facd1f8 100644 --- a/optionparser.h +++ b/optionparser.h @@ -14,7 +14,7 @@ #include "mothur.h" #include "mothurout.h" - +#include "command.hpp" /***********************************************************************/ @@ -23,6 +23,7 @@ public: OptionParser(string); ~OptionParser() {} map getParameters(); + bool getNameFile(vector); private: map parameters; MothurOut* m; diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 9cba70b..9509a20 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -14,6 +14,8 @@ vector ParseFastaQCommand::setParameters(){ try { CommandParameter pfastq("fastq", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfastq); + CommandParameter pfasta("fasta", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pfasta); + CommandParameter pqual("qfile", "Bool", "", "T", "", "", "",false,false); parameters.push_back(pqual); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -104,6 +106,15 @@ ParseFastaQCommand::ParseFastaQCommand(string option){ //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastaQFile); } + + string temp; + temp = validParameter.validFile(parameters, "fasta", false); if(temp == "not found"){ temp = "T"; } + fasta = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "qfile", false); if(temp == "not found"){ temp = "T"; } + qual = m->isTrue(temp); + + if ((!fasta) && (!qual)) { m->mothurOut("[ERROR]: no outputs selected. Aborting."); m->mothurOutEndLine(); abort=true; } } } @@ -122,13 +133,16 @@ int ParseFastaQCommand::execute(){ string fastaFile = outputDir + m->getRootName(m->getSimpleName(fastaQFile)) + "fasta"; string qualFile = outputDir + m->getRootName(m->getSimpleName(fastaQFile)) + "qual"; ofstream outFasta, outQual; - m->openOutputFile(fastaFile, outFasta); outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile); - m->openOutputFile(qualFile, outQual); outputNames.push_back(qualFile); outputTypes["qfile"].push_back(qualFile); + + if (fasta) { m->openOutputFile(fastaFile, outFasta); outputNames.push_back(fastaFile); outputTypes["fasta"].push_back(fastaFile); } + if (qual) { m->openOutputFile(qualFile, outQual); outputNames.push_back(qualFile); outputTypes["qfile"].push_back(qualFile); } ifstream in; m->openInputFile(fastaQFile, in); while (!in.eof()) { + + if (m->control_pressed) { break; } //read sequence name string name = m->getline(in); m->gobble(in); @@ -147,27 +161,27 @@ int ParseFastaQCommand::execute(){ else { name2 = name2.substr(1); } //read quality scores - string qual = m->getline(in); m->gobble(in); - if (qual == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } + string quality = m->getline(in); m->gobble(in); + if (quality == "") { m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; } //sanity check sequence length and number of quality scores match if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } } - if (qual.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(qual.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } - - //convert quality scores - vector qualScores = convertQual(qual); + if (quality.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; } //print sequence info to files - outFasta << ">" << name << endl << sequence << endl; + if (fasta) { outFasta << ">" << name << endl << sequence << endl; } - outQual << ">" << name << endl; - for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } - outQual << endl; + if (qual) { + vector qualScores = convertQual(quality); + outQual << ">" << name << endl; + for (int i = 0; i < qualScores.size(); i++) { outQual << qualScores[i] << " "; } + outQual << endl; + } } in.close(); - outFasta.close(); - outQual.close(); + if (fasta) { outFasta.close(); } + if (qual) { outQual.close(); } if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(fastaFile); m->mothurRemove(qualFile); return 0; } diff --git a/parsefastaqcommand.h b/parsefastaqcommand.h index 6052456..6feabce 100644 --- a/parsefastaqcommand.h +++ b/parsefastaqcommand.h @@ -34,7 +34,7 @@ private: vector outputNames; string outputDir, fastaQFile; - bool abort; + bool abort, fasta, qual; vector convertQual(string); }; diff --git a/parsimonycommand.cpp b/parsimonycommand.cpp index ccf2952..2d46efc 100644 --- a/parsimonycommand.cpp +++ b/parsimonycommand.cpp @@ -139,7 +139,7 @@ ParsimonyCommand::ParsimonyCommand(string option) { if (randomtree == "") { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); } @@ -153,7 +153,7 @@ ParsimonyCommand::ParsimonyCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } } @@ -177,6 +177,11 @@ ParsimonyCommand::ParsimonyCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + } } diff --git a/phylodiversitycommand.cpp b/phylodiversitycommand.cpp index 69c7b30..2b15d11 100644 --- a/phylodiversitycommand.cpp +++ b/phylodiversitycommand.cpp @@ -144,7 +144,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); @@ -159,7 +159,7 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -197,6 +197,11 @@ PhyloDiversityCommand::PhyloDiversityCommand(string option) { } if ((!collect) && (!rarefy) && (!summary)) { m->mothurOut("No outputs selected. You must set either collect, rarefy or summary to true, summary=T by default."); m->mothurOutEndLine(); abort=true; } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } } diff --git a/phylosummary.cpp b/phylosummary.cpp index 22e8e75..7d9fcac 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -121,6 +121,7 @@ string PhyloSummary::getNextTaxon(string& heirarchy){ int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ try { + numSeqs++; map::iterator childPointer; diff --git a/phylotree.cpp b/phylotree.cpp index e430fb9..dba1e3b 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -20,6 +20,7 @@ PhyloTree::PhyloTree(){ tree[0].heirarchyID = "0"; maxLevel = 0; calcTotals = true; + addSeqToTree("unknown", "unknown;"); } catch(exception& e) { m->errorOut(e, "PhyloTree", "PhyloTree"); @@ -127,6 +128,7 @@ PhyloTree::PhyloTree(string tfile){ maxLevel = 0; calcTotals = true; string name, tax; + addSeqToTree("unknown", "unknown;"); #ifdef USE_MPI @@ -232,7 +234,6 @@ string PhyloTree::getNextTaxon(string& heirarchy, string seqname){ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ try { - numSeqs++; map::iterator childPointer; @@ -375,7 +376,7 @@ void PhyloTree::binUnclassified(string file){ map::iterator childPointer; vector copy = tree; - + //fill out tree fillOutTree(0, copy); @@ -484,16 +485,16 @@ string PhyloTree::getFullTaxonomy(string seqName) { void PhyloTree::print(ofstream& out, vector& copy){ try { - + //output mothur version out << "#" << m->getVersion() << endl; out << copy.size() << endl; out << maxLevel << endl; - + for (int i = 0; i < copy.size(); i++) { - + out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; map::iterator it; @@ -606,6 +607,7 @@ bool PhyloTree::ErrorCheck(vector templateFileNames){ try { bool okay = true; + templateFileNames.push_back("unknown"); map::iterator itFind; map taxonomyFileNames = name2Taxonomy; diff --git a/phylotypecommand.cpp b/phylotypecommand.cpp index 00f960b..1d6c391 100644 --- a/phylotypecommand.cpp +++ b/phylotypecommand.cpp @@ -128,11 +128,11 @@ PhylotypeCommand::PhylotypeCommand(string option) { m->mothurOut("No valid current files. taxonomy is a required parameter."); m->mothurOutEndLine(); abort = true; } - }else if (taxonomyFileName == "not open") { abort = true; } + }else if (taxonomyFileName == "not open") { taxonomyFileName = ""; abort = true; } else { m->setTaxonomyFile(taxonomyFileName); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { readNamesFile(); m->setNameFile(namefile); } @@ -153,6 +153,11 @@ PhylotypeCommand::PhylotypeCommand(string option) { else { allLines = 1; } } + if (namefile == "") { + vector files; files.push_back(taxonomyFileName); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 21de034..582da49 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -147,7 +147,7 @@ PreClusterCommand::PreClusterCommand(string option) { // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not found") { namefile = ""; } - else if (namefile == "not open") { abort = true; } + else if (namefile == "not open") { namefile = ""; abort = true; } else { m->setNameFile(namefile); } groupfile = validParameter.validFile(parameters, "group", true); @@ -162,7 +162,10 @@ PreClusterCommand::PreClusterCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/referencedb.cpp b/referencedb.cpp index fb930ef..36400c1 100644 --- a/referencedb.cpp +++ b/referencedb.cpp @@ -22,6 +22,7 @@ void ReferenceDB::clearMemory() { setSavedReference(""); for(int i = 0; i < wordGenusProb.size(); i++) { wordGenusProb[i].clear(); } wordGenusProb.clear(); + WordPairDiffArr.clear(); setSavedTaxonomy(""); } /******************************************************* diff --git a/referencedb.h b/referencedb.h index 2f29247..5262e80 100644 --- a/referencedb.h +++ b/referencedb.h @@ -26,6 +26,7 @@ class ReferenceDB { bool save; vector referenceSeqs; vector< vector > wordGenusProb; + vector WordPairDiffArr; string getSavedReference() { return referencefile; } void setSavedReference(string p) { referencefile = p; } diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp index e5a3bb2..f35da2e 100644 --- a/removegroupscommand.cpp +++ b/removegroupscommand.cpp @@ -16,12 +16,12 @@ //********************************************************************************************************************** vector RemoveGroupsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pshared("shared", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pshared); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT-sharedGroup", "none",false,false); parameters.push_back(pgroup); - CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(pfasta); + CommandParameter pshared("shared", "InputTypes", "", "", "none", "sharedGroup", "none",false,false); parameters.push_back(pshared); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "sharedGroup", "FNGLT",false,false); parameters.push_back(pgroup); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(plist); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FNGLT",false,false); parameters.push_back(ptaxonomy); CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -253,6 +253,12 @@ RemoveGroupsCommand::RemoveGroupsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (sharedfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, taxonomy, group, shared or list."); m->mothurOutEndLine(); abort = true; } if ((groupfile == "") && ((namefile != "") || (fastafile != "") || (listfile != "") || (taxfile != ""))) { m->mothurOut("If using a fasta, name, taxonomy, group or list, then you must provide a group file."); m->mothurOutEndLine(); abort = true; } + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index f55158f..9fec249 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -169,12 +169,12 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -193,7 +193,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { else { m->setListFile(listfile); } taxfile = validParameter.validFile(parameters, "taxonomy", true); - if (taxfile == "not open") { abort = true; } + if (taxfile == "not open") { taxfile = ""; abort = true; } else if (taxfile == "not found") { taxfile = m->getTaxonomyFile(); if (taxfile != "") { m->mothurOut("Using " + taxfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } @@ -220,7 +220,12 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } - + + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 59c9b34..2abe9ef 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -194,12 +194,12 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { }else { m->setAccnosFile(accnosfile); } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = ""; } else { m->setFastaFile(fastafile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -237,6 +237,10 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 16618f7..4106739 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -260,6 +260,11 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index fe80b78..24142e0 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -122,7 +122,7 @@ AlignCheckCommand::AlignCheckCommand(string option) { else if (mapfile == "not found") { mapfile = ""; m->mothurOut("You must provide an map file."); m->mothurOutEndLine(); abort = true; } fastafile = validParameter.validFile(parameters, "fasta", true); - if (fastafile == "not open") { abort = true; } + if (fastafile == "not open") { fastafile = ""; abort = true; } else if (fastafile == "not found") { fastafile = m->getFastaFile(); if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } @@ -139,7 +139,11 @@ AlignCheckCommand::AlignCheckCommand(string option) { outputDir = ""; outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it } - + + if ((namefile == "") && (fastafile != "")){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 9ac7c5b..d8ebe50 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -176,7 +176,7 @@ SeqErrorCommand::SeqErrorCommand(string option) { if (queryFileName != "") { m->mothurOut("Using " + queryFileName + " as input file for the fasta parameter."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } } - else if (queryFileName == "not open") { abort = true; } + else if (queryFileName == "not open") { queryFileName = ""; abort = true; } else { m->setFastaFile(queryFileName); } referenceFileName = validParameter.validFile(parameters, "reference", true); @@ -246,6 +246,11 @@ SeqErrorCommand::SeqErrorCommand(string option) { substitutionMatrix.resize(6); for(int i=0;i<6;i++){ substitutionMatrix[i].resize(6,0); } + + if ((namesFileName == "") && (queryFileName != "")){ + vector files; files.push_back(queryFileName); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 93950d1..e8f73ca 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -132,8 +132,12 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); - - + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/sequence.cpp b/sequence.cpp index 162d3be..090ee14 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -604,7 +604,6 @@ void Sequence::padFromPos(int end){ bool Sequence::getIsAligned(){ return isAligned; } - //******************************************************************************************************************** void Sequence::reverseComplement(){ diff --git a/setdircommand.cpp b/setdircommand.cpp index 17bad60..67a1f59 100644 --- a/setdircommand.cpp +++ b/setdircommand.cpp @@ -202,8 +202,19 @@ int SetDirectoryCommand::execute(){ if (lastChar != "\\") { tempdefault += "\\"; } #endif - m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine(); - m->setDefaultPath(tempdefault); + //test to make sure directory exists + tempdefault = m->getFullPathName(tempdefault); + string inTemp = tempdefault + tag + "temp"; + ofstream in; + in.open(inTemp.c_str(), ios::trunc); + if(!in) { + m->mothurOut(tempdefault + " directory does not exist or is not writable."); m->mothurOutEndLine(); + }else{ + in.close(); + m->mothurRemove(inTemp); + m->mothurOut("tempDefault=" + tempdefault); m->mothurOutEndLine(); + m->setDefaultPath(tempdefault); + } } return 0; diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 40a4c84..5378211 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -23,7 +23,7 @@ vector ShhherCommand::setParameters(){ try { CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow); CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile); - CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plookup); + CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup); CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter); @@ -2152,7 +2152,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string qualityFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.qual"; + string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.qual"; ofstream qualityFile; m->openOutputFile(qualityFileName, qualityFile); @@ -2259,7 +2259,7 @@ void ShhherCommand::writeSequences(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fastaFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.fasta"; + string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.fasta"; ofstream fastaFile; m->openOutputFile(fastaFileName, fastaFile); @@ -2307,7 +2307,7 @@ void ShhherCommand::writeNames(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string nameFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; + string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.names"; ofstream nameFile; m->openOutputFile(nameFileName, nameFile); @@ -2345,7 +2345,7 @@ void ShhherCommand::writeGroups(){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fileRoot = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')); + string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)); string groupFileName = fileRoot + ".shhh.groups"; ofstream groupFile; m->openOutputFile(groupFileName, groupFile); @@ -2370,7 +2370,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string otuCountsFileName = thisOutputDir + flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.counts"; + string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.counts"; ofstream otuCountsFile; m->openOutputFile(otuCountsFileName, otuCountsFile); diff --git a/shhhseqscommand.cpp b/shhhseqscommand.cpp index 6a339b8..518e78d 100644 --- a/shhhseqscommand.cpp +++ b/shhhseqscommand.cpp @@ -165,6 +165,11 @@ ShhhSeqsCommand::ShhhSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/splitabundcommand.cpp b/splitabundcommand.cpp index 989ef59..8131fa9 100644 --- a/splitabundcommand.cpp +++ b/splitabundcommand.cpp @@ -216,7 +216,6 @@ SplitAbundCommand::SplitAbundCommand(string option) { m->mothurConvert(temp, cutoff); if (cutoff == 0) { m->mothurOut("You must provide a cutoff to qualify what is abundant for the split.abund command. "); m->mothurOutEndLine(); abort = true; } - } } diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp index 113439c..2671655 100644 --- a/splitgroupscommand.cpp +++ b/splitgroupscommand.cpp @@ -125,7 +125,7 @@ SplitGroupCommand::SplitGroupCommand(string option) { namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -151,6 +151,11 @@ SplitGroupCommand::SplitGroupCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(groupfile); } + + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp index fe9e934..c352feb 100644 --- a/subsamplecommand.cpp +++ b/subsamplecommand.cpp @@ -255,6 +255,10 @@ SubSampleCommand::SubSampleCommand(string option) { if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; } + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } } } diff --git a/summaryqualcommand.cpp b/summaryqualcommand.cpp index 421a8f2..2a40e08 100644 --- a/summaryqualcommand.cpp +++ b/summaryqualcommand.cpp @@ -111,7 +111,7 @@ SummaryQualCommand::SummaryQualCommand(string option) { //check for required parameters qualfile = validParameter.validFile(parameters, "qfile", true); - if (qualfile == "not open") { abort = true; } + if (qualfile == "not open") { qualfile = ""; abort = true; } else if (qualfile == "not found") { qualfile = m->getQualFile(); if (qualfile != "") { m->mothurOut("Using " + qualfile + " as input file for the qfile parameter."); m->mothurOutEndLine(); } @@ -131,7 +131,12 @@ SummaryQualCommand::SummaryQualCommand(string option) { string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - m->mothurConvert(temp, processors); + m->mothurConvert(temp, processors); + + if (namefile == "") { + vector files; files.push_back(qualfile); + parser.getNameFile(files); + } } } catch(exception& e) { diff --git a/summarytaxcommand.cpp b/summarytaxcommand.cpp index a19986a..ffa3041 100644 --- a/summarytaxcommand.cpp +++ b/summarytaxcommand.cpp @@ -157,6 +157,12 @@ SummaryTaxCommand::SummaryTaxCommand(string option) { outputDir = ""; outputDir += m->hasPath(taxfile); //if user entered a file with a path then preserve it } + + if (namefile == "") { + vector files; files.push_back(taxfile); + parser.getNameFile(files); + } + } } catch(exception& e) { diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 213e241..dd3427b 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -292,6 +292,11 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { m->mothurOut("You didn't set any options... quiting command."); m->mothurOutEndLine(); abort = true; } + + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } } } diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 047d06e..a404f79 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -155,7 +155,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -198,6 +198,11 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { m->splitAtDash(groups, Groups); m->setGroups(Groups); } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index b007db1..f9dc450 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -141,7 +141,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { //check for required parameters treefile = validParameter.validFile(parameters, "tree", true); - if (treefile == "not open") { abort = true; } + if (treefile == "not open") { treefile = ""; abort = true; } else if (treefile == "not found") { //if there is a current design file, use it treefile = m->getTreeFile(); if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); } @@ -155,7 +155,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { else { m->setGroupFile(groupfile); } namefile = validParameter.validFile(parameters, "name", true); - if (namefile == "not open") { abort = true; } + if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } @@ -192,6 +192,11 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { m->mothurConvert(temp, processors); if (!random) { iters = 0; } //turn off random calcs + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } } -- 2.39.2