From: westcott Date: Fri, 24 Sep 2010 14:03:47 +0000 (+0000) Subject: added get.lineage and remove.lineage commands X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=ca0785e447ca7aa7e2f0ab8bb1155db126b551ea added get.lineage and remove.lineage commands --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 2721d08..9c147b8 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -64,6 +64,10 @@ A7825503116519F70002E2DD /* chimerabellerophoncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerabellerophoncommand.cpp; sourceTree = ""; }; A78434881162224F00100BE0 /* chimeraccodecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraccodecommand.h; sourceTree = ""; }; A78434891162224F00100BE0 /* chimeraccodecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraccodecommand.cpp; sourceTree = ""; }; + A787A24F124CB46C0076EB84 /* getlineagecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlineagecommand.h; sourceTree = ""; }; + A787A250124CB46C0076EB84 /* getlineagecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlineagecommand.cpp; sourceTree = ""; }; + A787A28E124CE1470076EB84 /* removelineagecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removelineagecommand.h; sourceTree = ""; }; + A787A28F124CE1470076EB84 /* removelineagecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removelineagecommand.cpp; sourceTree = ""; }; A798626D1240D91B005FC847 /* normalizesharedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = normalizesharedcommand.h; sourceTree = ""; }; A798626E1240D91B005FC847 /* normalizesharedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = normalizesharedcommand.cpp; sourceTree = ""; }; A7AE6302121C3408001DE6FC /* sharedrabundfloatvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedrabundfloatvector.h; sourceTree = ""; }; @@ -772,6 +776,8 @@ A7DA205B113FECD400BF472F /* getgroupcommand.h */, A7DA205C113FECD400BF472F /* getlabelcommand.cpp */, A7DA205D113FECD400BF472F /* getlabelcommand.h */, + A787A24F124CB46C0076EB84 /* getlineagecommand.h */, + A787A250124CB46C0076EB84 /* getlineagecommand.cpp */, A7DA205F113FECD400BF472F /* getlistcountcommand.h */, A7DA205E113FECD400BF472F /* getlistcountcommand.cpp */, A7DA2061113FECD400BF472F /* getoturepcommand.h */, @@ -840,6 +846,8 @@ A7DA20EA113FECD400BF472F /* readotucommand.cpp */, A7DA20F1113FECD400BF472F /* readtreecommand.h */, A7DA20F0113FECD400BF472F /* readtreecommand.cpp */, + A787A28E124CE1470076EB84 /* removelineagecommand.h */, + A787A28F124CE1470076EB84 /* removelineagecommand.cpp */, A7DA20F3113FECD400BF472F /* removeseqscommand.h */, A7DA20F2113FECD400BF472F /* removeseqscommand.cpp */, A7DA20F5113FECD400BF472F /* reversecommand.h */, diff --git a/commandfactory.cpp b/commandfactory.cpp index 4651a07..94da28f 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -89,6 +89,8 @@ #include "metastatscommand.h" #include "splitgroupscommand.h" #include "clusterfragmentscommand.h" +#include "getlineagecommand.h" +#include "removelineagecommand.h" /*******************************************************/ @@ -182,6 +184,8 @@ CommandFactory::CommandFactory(){ commands["metastats"] = "metastats"; commands["split.groups"] = "split.groups"; commands["cluster.fragments"] = "cluster.fragments"; + commands["get.lineage"] = "get.lineage"; + commands["remove.lineage"] = "remove.lineage"; commands["classify.seqs"] = "MPIEnabled"; commands["dist.seqs"] = "MPIEnabled"; commands["filter.seqs"] = "MPIEnabled"; @@ -318,6 +322,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "metastats") { command = new MetaStatsCommand(optionString); } else if(commandName == "split.groups") { command = new SplitGroupCommand(optionString); } else if(commandName == "cluster.fragments") { command = new ClusterFragmentsCommand(optionString); } + else if(commandName == "get.lineage") { command = new GetLineageCommand(optionString); } + else if(commandName == "remove.lineage") { command = new RemoveLineageCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/engine.cpp b/engine.cpp index 54524ba..a90bb7e 100644 --- a/engine.cpp +++ b/engine.cpp @@ -571,11 +571,16 @@ string ScriptEngine::getNextCommand(string& commandString) { string nextcommand = ""; int count = 0; + bool ignoreSemiColons = false; //go through string until you reach ; or end while (count < commandString.length()) { - if (commandString[count] == ';') { break; } + //you want to ignore any ; until you reach the next ' + if ((commandString[count] == '\'') && (!ignoreSemiColons)) { ignoreSemiColons = true; } + else if ((commandString[count] == '\'') && (ignoreSemiColons)) { ignoreSemiColons = false; } + + if ((commandString[count] == ';') && (!ignoreSemiColons)) { break; } else { nextcommand += commandString[count]; } count++; diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp new file mode 100644 index 0000000..5ace2a1 --- /dev/null +++ b/getlineagecommand.cpp @@ -0,0 +1,612 @@ +/* + * getlineagecommand.cpp + * Mothur + * + * Created by westcott on 9/24/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "getlineagecommand.h" +#include "sequence.hpp" +#include "listvector.hpp" + +//********************************************************************************************************************** + +GetLineageCommand::GetLineageCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","name", "group", "alignreport", "taxon", "dups", "list","taxonomy","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("alignreport"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["alignreport"] = inputDir + it->second; } + } + + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + + alignfile = validParameter.validFile(parameters, "alignreport", true); + if (alignfile == "not open") { abort = true; } + else if (alignfile == "not found") { alignfile = ""; } + + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not open") { abort = true; } + else if (listfile == "not found") { listfile = ""; } + + taxfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxfile == "not open") { abort = true; } + else if (taxfile == "not found") { taxfile = ""; m->mothurOut("The taxonomy parameter is required for the get.lineage command."); m->mothurOutEndLine(); abort = true; } + + string usedDups = "true"; + string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } + dups = m->isTrue(temp); + + taxons = validParameter.validFile(parameters, "taxon", false); + if (taxons == "not found") { taxons = ""; m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine(); abort = true; } + else { + //rip off quotes + if (taxons[0] == '\'') { taxons = taxons.substr(1); } + if (taxons[(taxons.length()-1)] == '\'') { taxons = taxons.substr(0, (taxons.length()-1)); } + } + + + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } + + } + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "GetLineageCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void GetLineageCommand::help(){ + try { + m->mothurOut("The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"); + m->mothurOut("It outputs a file containing only the sequences from the taxonomy file that are from the taxon requested.\n"); + m->mothurOut("The get.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy and taxon.\n"); + m->mothurOut("The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"); + m->mothurOut("The taxon parameter allows you to select the taxons you would like to get.\n"); + m->mothurOut("The get.lineage command should be in the following format: get.lineage(taxonomy=yourTaxonomyFile, taxon=yourTaxons).\n"); + m->mothurOut("Example get.lineage(taxonomy=amazon.silva.taxonomy, taxon=Bacteria;Firmicutes;Bacilli;Lactobacillales;).\n"); + m->mothurOut("Note: If you are running mothur in script mode you must wrap the taxon in ' characters so mothur will ignore the ; in the taxon.\n"); + m->mothurOut("Example get.lineage(taxonomy=amazon.silva.taxonomy, taxon='Bacteria;Firmicutes;Bacilli;Lactobacillales;').\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +int GetLineageCommand::execute(){ + try { + + if (abort == true) { return 0; } + + if (m->control_pressed) { return 0; } + + //read through the correct file and output lines you want to keep + if (taxfile != "") { readTax(); } //fills the set of names to get + if (namefile != "") { readName(); } + if (fastafile != "") { readFasta(); } + if (groupfile != "") { readGroup(); } + if (alignfile != "") { readAlign(); } + if (listfile != "") { readList(); } + + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + if (outputNames.size() != 0) { + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + } + + return 0; + } + + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +int GetLineageCommand::readFasta(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pick" + m->getExtension(fastafile); + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(fastafile, in); + string name; + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + Sequence currSeq(in); + name = currSeq.getName(); + + if (name != "") { + //if this name is in the accnos file + if (names.count(name) != 0) { + wroteSomething = true; + + currSeq.printSequence(out); + } + } + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readFasta"); + exit(1); + } +} +//********************************************************************************************************************** +int GetLineageCommand::readList(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "pick" + m->getExtension(listfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(listfile, in); + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + //read in list vector + ListVector list(in); + + //make a new list vector + ListVector newList; + newList.setLabel(list.getLabel()); + + //for each bin + for (int i = 0; i < list.getNumBins(); i++) { + + //parse out names that are in accnos file + string binnames = list.get(i); + + string newNames = ""; + while (binnames.find_first_of(',') != -1) { + string name = binnames.substr(0,binnames.find_first_of(',')); + binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); + + //if that name is in the .accnos file, add it + if (names.count(name) != 0) { newNames += name + ","; } + } + + //get last name + if (names.count(binnames) != 0) { newNames += binnames + ","; } + + //if there are names in this bin add to new list + if (newNames != "") { + newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma + newList.push_back(newNames); + } + } + + //print new listvector + if (newList.getNumBins() != 0) { + wroteSomething = true; + newList.print(out); + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readList"); + exit(1); + } +} +//********************************************************************************************************************** +int GetLineageCommand::readName(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pick" + m->getExtension(namefile); + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(namefile, in); + string name, firstCol, secondCol; + + bool wroteSomething = false; + + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> firstCol; + in >> secondCol; + + string hold = ""; + if (dups) { hold = secondCol; } + + vector parsedNames; + //parse second column saving each name + while (secondCol.find_first_of(',') != -1) { + name = secondCol.substr(0,secondCol.find_first_of(',')); + secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); + parsedNames.push_back(name); + } + + //get name after last , + parsedNames.push_back(secondCol); + + vector validSecond; + for (int i = 0; i < parsedNames.size(); i++) { + if (names.count(parsedNames[i]) != 0) { + validSecond.push_back(parsedNames[i]); + } + } + + if ((dups) && (validSecond.size() != 0)) { //dups = true and we want to add someone, then add everyone + for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); } + out << firstCol << '\t' << hold << endl; + wroteSomething = true; + }else { + //if the name in the first column is in the set then print it and any other names in second column also in set + if (names.count(firstCol) != 0) { + + wroteSomething = true; + + out << firstCol << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + + + //make first name in set you come to first column and then add the remaining names to second column + }else { + //you want part of this row + if (validSecond.size() != 0) { + + wroteSomething = true; + + out << validSecond[0] << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + } + } + } + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readName"); + exit(1); + } +} + +//********************************************************************************************************************** +int GetLineageCommand::readGroup(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pick" + m->getExtension(groupfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(groupfile, in); + string name, group; + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + + in >> name; //read from first column + in >> group; //read from second column + + //if this name is in the accnos file + if (names.count(name) != 0) { + wroteSomething = true; + + out << name << '\t' << group << endl; + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readGroup"); + exit(1); + } +} +//********************************************************************************************************************** +int GetLineageCommand::readTax(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(taxfile, in); + string name, tax; + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> name; //read from first column + in >> tax; //read from second column + + string newtax = tax; + + //if the users file contains confidence scores we want to ignore them when searching for the taxons + int hasConfidences = tax.find_first_of('('); + if (hasConfidences != string::npos) { + newtax = removeConfidences(tax); + } + + int pos = newtax.find(taxons); + + if (pos != string::npos) { //this sequence contains the taxon the user wants + names.insert(name); + out << name << '\t' << tax << endl; + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (names.size() == 0) { m->mothurOut("Your taxonomy file does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readTax"); + exit(1); + } +} +/**************************************************************************************************/ +string GetLineageCommand::removeConfidences(string tax) { + try { + + string taxon = ""; + int taxLength = tax.length(); + for(int i=0;ierrorOut(e, "GetLineageCommand", "removeConfidences"); + exit(1); + } +} +//********************************************************************************************************************** +//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name +int GetLineageCommand::readAlign(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(alignfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(alignfile)) + "pick.align.report"; + ofstream out; + m->openOutputFile(outputFileName, out); + + + ifstream in; + m->openInputFile(alignfile, in); + string name, junk; + + bool wroteSomething = false; + + //read column headers + for (int i = 0; i < 16; i++) { + if (!in.eof()) { in >> junk; out << junk << '\t'; } + else { break; } + } + out << endl; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + + in >> name; //read from first column + + //if this name is in the accnos file + if (names.count(name) != 0) { + wroteSomething = true; + + out << name << '\t'; + + //read rest + for (int i = 0; i < 15; i++) { + if (!in.eof()) { in >> junk; out << junk << '\t'; } + else { break; } + } + out << endl; + + }else {//still read just don't do anything with it + //read rest + for (int i = 0; i < 15; i++) { + if (!in.eof()) { in >> junk; } + else { break; } + } + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readAlign"); + exit(1); + } +} +//********************************************************************************************************************** + + + + diff --git a/getlineagecommand.h b/getlineagecommand.h new file mode 100644 index 0000000..0d07ac6 --- /dev/null +++ b/getlineagecommand.h @@ -0,0 +1,40 @@ +#ifndef GETLINEAGECOMMAND_H +#define GETLINEAGECOMMAND_H + +/* + * getlineagecommand.h + * Mothur + * + * Created by westcott on 9/24/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +class GetLineageCommand : public Command { + + public: + + GetLineageCommand(string); + ~GetLineageCommand(){}; + int execute(); + void help(); + + private: + set names; + vector outputNames; + string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + bool abort, dups; + + int readFasta(); + int readName(); + int readGroup(); + int readAlign(); + int readList(); + int readTax(); + string removeConfidences(string); +}; + +#endif + diff --git a/mothur b/mothur index 83351b2..cd70f8b 100755 Binary files a/mothur and b/mothur differ diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp new file mode 100644 index 0000000..d316019 --- /dev/null +++ b/removelineagecommand.cpp @@ -0,0 +1,596 @@ +/* + * removelineagecommand.cpp + * Mothur + * + * Created by westcott on 9/24/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "removelineagecommand.h" +#include "sequence.hpp" +#include "listvector.hpp" + +//********************************************************************************************************************** + +RemoveLineageCommand::RemoveLineageCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","name", "group", "alignreport", "taxon", "dups", "list","taxonomy","outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("alignreport"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["alignreport"] = inputDir + it->second; } + } + + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + } + + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + + alignfile = validParameter.validFile(parameters, "alignreport", true); + if (alignfile == "not open") { abort = true; } + else if (alignfile == "not found") { alignfile = ""; } + + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not open") { abort = true; } + else if (listfile == "not found") { listfile = ""; } + + taxfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxfile == "not open") { abort = true; } + else if (taxfile == "not found") { taxfile = ""; m->mothurOut("The taxonomy parameter is required for the get.lineage command."); m->mothurOutEndLine(); abort = true; } + + string usedDups = "true"; + string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } + dups = m->isTrue(temp); + + taxons = validParameter.validFile(parameters, "taxon", false); + if (taxons == "not found") { taxons = ""; m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine(); abort = true; } + else { + //rip off quotes + if (taxons[0] == '\'') { taxons = taxons.substr(1); } + if (taxons[(taxons.length()-1)] == '\'') { taxons = taxons.substr(0, (taxons.length()-1)); } + } + + + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } + + } + + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void RemoveLineageCommand::help(){ + try { + m->mothurOut("The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"); + m->mothurOut("It outputs a file containing only the sequences from the taxonomy file that are not from the taxon you requested to be removed.\n"); + m->mothurOut("The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy and taxon.\n"); + m->mothurOut("The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"); + m->mothurOut("The taxon parameter allows you to select the taxons you would like to remove.\n"); + m->mothurOut("The remove.lineage command should be in the following format: remove.lineage(taxonomy=yourTaxonomyFile, taxon=yourTaxons).\n"); + m->mothurOut("Example remove.lineage(taxonomy=amazon.silva.taxonomy, taxon=Bacteria;Firmicutes;Bacilli;Lactobacillales;).\n"); + m->mothurOut("Note: If you are running mothur in script mode you must wrap the taxon in ' characters so mothur will ignore the ; in the taxon.\n"); + m->mothurOut("Example remove.lineage(taxonomy=amazon.silva.taxonomy, taxon='Bacteria;Firmicutes;Bacilli;Lactobacillales;').\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +int RemoveLineageCommand::execute(){ + try { + + if (abort == true) { return 0; } + + if (m->control_pressed) { return 0; } + + //read through the correct file and output lines you want to keep + if (taxfile != "") { readTax(); } //fills the set of names to remove + if (namefile != "") { readName(); } + if (fastafile != "") { readFasta(); } + if (groupfile != "") { readGroup(); } + if (alignfile != "") { readAlign(); } + if (listfile != "") { readList(); } + + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + if (outputNames.size() != 0) { + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + } + + return 0; + } + + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +int RemoveLineageCommand::readFasta(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pick" + m->getExtension(fastafile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(fastafile, in); + string name; + + bool wroteSomething = false; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + Sequence currSeq(in); + name = currSeq.getName(); + + if (name != "") { + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + + currSeq.printSequence(out); + } + } + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your fasta file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readFasta"); + exit(1); + } +} +//********************************************************************************************************************** +int RemoveLineageCommand::readList(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(listfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "pick" + m->getExtension(listfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(listfile, in); + + bool wroteSomething = false; + + while(!in.eof()){ + //read in list vector + ListVector list(in); + + //make a new list vector + ListVector newList; + newList.setLabel(list.getLabel()); + + //for each bin + for (int i = 0; i < list.getNumBins(); i++) { + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + //parse out names that are in accnos file + string binnames = list.get(i); + + string newNames = ""; + while (binnames.find_first_of(',') != -1) { + string name = binnames.substr(0,binnames.find_first_of(',')); + binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); + + //if that name is in the .accnos file, add it + if (names.count(name) == 0) { newNames += name + ","; } + } + + //get last name + if (names.count(binnames) == 0) { newNames += binnames + ","; } + + //if there are names in this bin add to new list + if (newNames != "") { + newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma + newList.push_back(newNames); + } + } + + //print new listvector + if (newList.getNumBins() != 0) { + wroteSomething = true; + newList.print(out); + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your list file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readList"); + exit(1); + } +} +//********************************************************************************************************************** +int RemoveLineageCommand::readName(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pick" + m->getExtension(namefile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(namefile, in); + string name, firstCol, secondCol; + + bool wroteSomething = false; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> firstCol; + in >> secondCol; + + vector parsedNames; + //parse second column saving each name + while (secondCol.find_first_of(',') != -1) { + name = secondCol.substr(0,secondCol.find_first_of(',')); + secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); + parsedNames.push_back(name); + } + + //get name after last , + parsedNames.push_back(secondCol); + + vector validSecond; validSecond.clear(); + for (int i = 0; i < parsedNames.size(); i++) { + if (names.count(parsedNames[i]) == 0) { + validSecond.push_back(parsedNames[i]); + } + } + + if ((dups) && (validSecond.size() != parsedNames.size())) { //if dups is true and we want to get rid of anyone, get rid of everyone + for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); } + }else { + //if the name in the first column is in the set then print it and any other names in second column also in set + if (names.count(firstCol) == 0) { + + wroteSomething = true; + + out << firstCol << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + + //make first name in set you come to first column and then add the remaining names to second column + }else { + + //you want part of this row + if (validSecond.size() != 0) { + + wroteSomething = true; + + out << validSecond[0] << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + } + } + } + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your name file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readName"); + exit(1); + } +} + +//********************************************************************************************************************** +int RemoveLineageCommand::readGroup(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pick" + m->getExtension(groupfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(groupfile, in); + string name, group; + + bool wroteSomething = false; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> name; //read from first column + in >> group; //read from second column + + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + out << name << '\t' << group << endl; + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your group file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readGroup"); + exit(1); + } +} +//********************************************************************************************************************** +int RemoveLineageCommand::readTax(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile); + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(taxfile, in); + string name, tax; + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> name; //read from first column + in >> tax; //read from second column + + string newtax = tax; + + //if the users file contains confidence scores we want to ignore them when searching for the taxons + int hasConfidences = tax.find_first_of('('); + if (hasConfidences != string::npos) { + newtax = removeConfidences(tax); + } + + int pos = newtax.find(taxons); + + if (pos == string::npos) { + wroteSomething = true; + out << name << '\t' << tax << endl; + }else{ //this sequence contains the taxon the user wants to remove + names.insert(name); + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (!wroteSomething) { m->mothurOut("Your taxonomy file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readTax"); + exit(1); + } +} +/**************************************************************************************************/ +string RemoveLineageCommand::removeConfidences(string tax) { + try { + + string taxon = ""; + int taxLength = tax.length(); + for(int i=0;ierrorOut(e, "RemoveLineageCommand", "removeConfidences"); + exit(1); + } +} +//********************************************************************************************************************** +//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name +int RemoveLineageCommand::readAlign(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(alignfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(alignfile)) + "pick.align.report"; + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(alignfile, in); + string name, junk; + + bool wroteSomething = false; + + //read column headers + for (int i = 0; i < 16; i++) { + if (!in.eof()) { in >> junk; out << junk << '\t'; } + else { break; } + } + out << endl; + + while(!in.eof()){ + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } + + in >> name; //read from first column + + //if this name is in the accnos file + if (names.count(name) == 0) { + wroteSomething = true; + + out << name << '\t'; + + //read rest + for (int i = 0; i < 15; i++) { + if (!in.eof()) { in >> junk; out << junk << '\t'; } + else { break; } + } + out << endl; + + }else {//still read just don't do anything with it + + //read rest + for (int i = 0; i < 15; i++) { + if (!in.eof()) { in >> junk; } + else { break; } + } + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your align file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputNames.push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readAlign"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/removelineagecommand.h b/removelineagecommand.h new file mode 100644 index 0000000..d18fc23 --- /dev/null +++ b/removelineagecommand.h @@ -0,0 +1,40 @@ +#ifndef REMOVELINEAGECOMMAND_H +#define REMOVELINEAGECOMMAND_H + +/* + * removelineagecommand.h + * Mothur + * + * Created by westcott on 9/24/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +class RemoveLineageCommand : public Command { + + public: + + RemoveLineageCommand(string); + ~RemoveLineageCommand(){}; + int execute(); + void help(); + + private: + set names; + vector outputNames; + string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + bool abort, dups; + + int readFasta(); + int readName(); + int readGroup(); + int readAlign(); + int readList(); + int readTax(); + string removeConfidences(string); +}; + +#endif +