From 0319c36f068f092c766e1bff34b3554c1858255a Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 10 May 2010 16:59:29 +0000 Subject: [PATCH] added make.group command --- Mothur.xcodeproj/project.pbxproj | 4 +-- classifyseqscommand.cpp | 60 +++++++++++++++++++------------ classifyseqscommand.h | 4 +-- commandfactory.cpp | 3 ++ makefile | 14 +++++--- makegroupcommand.cpp | 61 ++++++++++++++++++++------------ makegroupcommand.h | 2 +- 7 files changed, 94 insertions(+), 54 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index eb9edec..29de1db 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -654,8 +654,8 @@ A78254471164D7790002E2DD /* chimerapintailcommand.cpp */, A747E81C116365E000FB9042 /* chimeraslayercommand.h */, A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */, - A7DA201D113FECD400BF472F /* classifyseqscommand.cpp */, A7DA201E113FECD400BF472F /* classifyseqscommand.h */, + A7DA201D113FECD400BF472F /* classifyseqscommand.cpp */, A7DA2021113FECD400BF472F /* clustercommand.cpp */, A7DA2022113FECD400BF472F /* clustercommand.h */, A7DA2025113FECD400BF472F /* collectcommand.cpp */, @@ -682,8 +682,8 @@ A7DA2063113FECD400BF472F /* getrabundcommand.h */, A7DA2064113FECD400BF472F /* getsabundcommand.cpp */, A7DA2065113FECD400BF472F /* getsabundcommand.h */, - A7DA2066113FECD400BF472F /* getseqscommand.cpp */, A7DA2067113FECD400BF472F /* getseqscommand.h */, + A7DA2066113FECD400BF472F /* getseqscommand.cpp */, A7DA2068113FECD400BF472F /* getsharedotucommand.cpp */, A7DA2069113FECD400BF472F /* getsharedotucommand.h */, A7DA2074113FECD400BF472F /* hclustercommand.cpp */, diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 39f28d0..8588328 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -413,8 +413,6 @@ int ClassifySeqsCommand::execute(){ //delete inFileName; if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } - - if(namefile != "") { MPIReadNamesFile(namefileNames[s]); } if (pid == 0) { //you are the root process @@ -465,22 +463,7 @@ int ClassifySeqsCommand::execute(){ MPI_File_close(&outMPITempTax); #else - //read namefile - if(namefile != "") { - nameMap.clear(); //remove old names - - ifstream inNames; - openInputFile(namefileNames[s], inNames); - - string firstCol, secondCol; - while(!inNames.eof()) { - inNames >> firstCol >> secondCol; gobble(inNames); - nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. - } - inNames.close(); - } - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ ifstream inFASTA; openInputFile(fastaFileNames[s], inFASTA); @@ -545,6 +528,32 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI if (pid == 0) { //this part does not need to be paralellized + + if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); MPIReadNamesFile(namefileNames[s]); m->mothurOut(" Done."); m->mothurOutEndLine(); } + #else + //read namefile + if(namefile != "") { + + m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); + + nameMap.clear(); //remove old names + + ifstream inNames; + openInputFile(namefileNames[s], inNames); + + string firstCol, secondCol; + while(!inNames.eof()) { + inNames >> firstCol >> secondCol; gobble(inNames); + + vector temp; + splitAtComma(secondCol, temp); + + nameMap[firstCol] = temp; + } + inNames.close(); + + m->mothurOut(" Done."); m->mothurOutEndLine(); + } #endif m->mothurOutEndLine(); @@ -557,7 +566,7 @@ int ClassifySeqsCommand::execute(){ PhyloSummary taxaSum(taxonomyFileName, group); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } - + if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } else { ifstream in; @@ -565,6 +574,7 @@ int ClassifySeqsCommand::execute(){ //read in users taxonomy file and add sequences to tree string name, taxon; + while(!in.eof()){ in >> name >> taxon; gobble(in); @@ -573,9 +583,11 @@ int ClassifySeqsCommand::execute(){ if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); }else{ - for (int i = 0; i < itNames->second; i++) { - taxaSum.addSeqToTree(name, taxon); //add it as many times as there are identical seqs + for (int i = 0; i < itNames->second.size(); i++) { + taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs } + itNames->second.clear(); + nameMap.erase(itNames->first); } } in.close(); @@ -886,7 +898,11 @@ int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){ string firstCol, secondCol; while(!iss.eof()) { iss >> firstCol >> secondCol; gobble(iss); - nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. + + vector temp; + splitAtComma(secondCol, temp); + + nameMap[firstCol] = temp; } MPI_File_close(&inMPI); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 639ae65..1dc4c1b 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -44,8 +44,8 @@ private: vector fastaFileNames; vector namefileNames; vector groupfileNames; - map nameMap; - map::iterator itNames; + map > nameMap; + map >::iterator itNames; Classify* classify; diff --git a/commandfactory.cpp b/commandfactory.cpp index 30b76f8..e2d114a 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -73,6 +73,7 @@ #include "chimerabellerophoncommand.h" #include "setlogfilecommand.h" #include "phylodiversitycommand.h" +#include "makegroupcommand.h" /*******************************************************/ @@ -153,6 +154,7 @@ CommandFactory::CommandFactory(){ commands["parse.sff"] = "parse.sff"; commands["set.logfile"] = "set.logfile"; commands["phylo.diversity"] = "phylo.diversity"; + commands["make.group"] = "make.group"; commands["classify.seqs"] = "MPIEnabled"; commands["dist.seqs"] = "MPIEnabled"; commands["filter.seqs"] = "MPIEnabled"; @@ -270,6 +272,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "parse.list") { command = new ParseListCommand(optionString); } else if(commandName == "parse.sff") { command = new ParseSFFCommand(optionString); } else if(commandName == "phylo.diversity") { command = new PhyloDiversityCommand(optionString); } + else if(commandName == "make.group") { command = new MakeGroupCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/makefile b/makefile index b421ca2..ea9f9ac 100644 --- a/makefile +++ b/makefile @@ -145,7 +145,8 @@ mothur : \ ./fileoutput.o\ ./globaldata.o\ ./groupmap.o\ - ./helpcommand.o\ + ./helpcommand.o\ + ./makegroupcommand.o\ ./inputdata.o\ ./jackknife.o\ ./kmer.o\ @@ -346,7 +347,8 @@ mothur : \ ./fileoutput.o\ ./globaldata.o\ ./groupmap.o\ - ./helpcommand.o\ + ./helpcommand.o\ + ./makegroupcommand.o\ ./inputdata.o\ ./jackknife.o\ ./kmer.o\ @@ -445,7 +447,7 @@ mothur : \ ./logsd.o\ ./geom.o\ ./setlogfilecommand.o\ - -o mothur + -o ../Release/mothur clean : rm \ @@ -550,7 +552,8 @@ clean : ./fileoutput.o\ ./globaldata.o\ ./groupmap.o\ - ./helpcommand.o\ + ./helpcommand.o\ + ./makegroupcommand.o\ ./inputdata.o\ ./jackknife.o\ ./kmer.o\ @@ -1649,6 +1652,9 @@ install : mothur ./phylodiversitycommand.o : phylodiversitycommand.cpp $(CC) $(CC_OPTIONS) phylodiversitycommand.cpp -c $(INCLUDE) -o ./phylodiversitycommand.o +# Item # 201 -- makegroupcommand -- +./makegroupcommand.o : makegroupcommand.cpp + $(CC) $(CC_OPTIONS) makegroupcommand.cpp -c $(INCLUDE) -o ./makegroupcommand.o ##### END RUN #### diff --git a/makegroupcommand.cpp b/makegroupcommand.cpp index 24c2de2..f567fbd 100644 --- a/makegroupcommand.cpp +++ b/makegroupcommand.cpp @@ -8,6 +8,7 @@ */ #include "makegroupcommand.h" +#include "sequence.hpp" //********************************************************************************************************************** @@ -44,6 +45,7 @@ MakeGroupCommand::MakeGroupCommand(string option) { string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } + filename = outputDir; fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the make.group command."); m->mothurOutEndLine(); abort = true; } @@ -65,18 +67,24 @@ MakeGroupCommand::MakeGroupCommand(string option) { in.close(); if (ableToOpen == 1) { - m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); //erase from file list fastaFileNames.erase(fastaFileNames.begin()+i); i--; - } + }else{ filename += getRootName(getSimpleName(fastaFileNames[i])); } } + filename += "groups"; + //make sure there is at least one valid file left if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } } - + + groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { m->mothurOut("groups is a required parameter for the make.group command."); m->mothurOutEndLine(); abort = true; } + else { splitAtDash(groups, groupsNames); } + + if (groupsNames.size() != fastaFileNames.size()) { m->mothurOut("You do not have the same number of valid fastfile files as groups. This could be because we could not open a fastafile."); m->mothurOutEndLine(); abort = true; } } } @@ -94,24 +102,12 @@ MakeGroupCommand::~MakeGroupCommand(){ } void MakeGroupCommand::help(){ try { - m->mothurOut("The align.seqs command reads a file containing sequences and creates an alignment file and a report file.\n"); - m->mothurOut("The align.seqs command parameters are template, candidate, search, ksize, align, match, mismatch, gapopen and gapextend.\n"); - m->mothurOut("The template and candidate parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); - m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); - m->mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n"); - m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); - m->mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); - m->mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); - m->mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"); - m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"); - m->mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse complement if a sequence falls below the threshold. The default is false.\n"); - m->mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. The default threshold is 0.50, meaning 50% of the bases are removed in the alignment.\n"); - m->mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n"); - m->mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n"); - m->mothurOut("The align.seqs command should be in the following format: \n"); - m->mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n"); - m->mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. candidate), '=' and parameters (i.e.yourFastaFile).\n\n"); + m->mothurOut("The make.group command reads a fasta file or series of fasta files and creates a groupfile.\n"); + m->mothurOut("The make.group command parameters are fasta and groups, both are required.\n"); + m->mothurOut("The make.group command should be in the following format: \n"); + m->mothurOut("make.group(fasta=yourFastaFiles, groups=yourGroups. \n"); + m->mothurOut("Example make.group(fasta=seqs1.fasta-seq2.fasta-seqs3.fasta, groups=A-B-C)\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFiles).\n\n"); } catch(exception& e) { m->errorOut(e, "MakeGroupCommand", "help"); @@ -126,10 +122,27 @@ int MakeGroupCommand::execute(){ try { if (abort == true) { return 0; } + ofstream out; + openOutputFile(filename, out); + + for (int i = 0; i < fastaFileNames.size(); i++) { + + ifstream in; + openInputFile(fastaFileNames[i], in); + + while (!in.eof()) { + + Sequence seq(in); gobble(in); + + if (seq.getName() != "") { out << seq.getName() << '\t' << groupsNames[i] << endl; } + } + in.close(); + } + + out.close(); m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - //for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOut("Output File Name: " + filename); m->mothurOutEndLine(); m->mothurOutEndLine(); return 0; @@ -139,4 +152,6 @@ int MakeGroupCommand::execute(){ exit(1); } } +//********************************************************************************************************************** + diff --git a/makegroupcommand.h b/makegroupcommand.h index 76435bf..23d4598 100644 --- a/makegroupcommand.h +++ b/makegroupcommand.h @@ -22,7 +22,7 @@ public: private: - string fastaFileName, groups, outputDir; + string fastaFileName, groups, outputDir, filename; vector fastaFileNames; vector groupsNames; -- 2.39.2