From 3504e4e2feeb05aabb0c79aa42cb696522030924 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 8 Jan 2013 12:19:41 -0500 Subject: [PATCH] added dereplicate parameter to chimera.slayer and chimera.persues. added minnumsamples and minpercentsamples to filter.shared. fixed output names in cluster commands. added count parameter to parse.list. improved speed and memory requirements on sens.spec. added debugging code to some phylotree functions. --- chimeraperseuscommand.cpp | 20 +++++- chimeraperseuscommand.h | 2 +- chimeraslayercommand.cpp | 18 +++++- chimeraslayercommand.h | 2 +- chimerauchimecommand.cpp | 4 +- clustercommand.cpp | 2 +- clusterdoturcommand.cpp | 2 +- clustersplitcommand.cpp | 2 +- filtersharedcommand.cpp | 38 ++++++++++- filtersharedcommand.h | 4 +- mgclustercommand.cpp | 2 +- parselistscommand.cpp | 130 +++++++++++++++++++++++-------------- parselistscommand.h | 4 +- phylotree.cpp | 10 +++ sensspeccommand.cpp | 131 +++++++++++++++++++++++++++++++++++++- sensspeccommand.h | 1 + 16 files changed, 304 insertions(+), 68 deletions(-) diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index 1e5f9d4..b4e478c 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -20,6 +20,8 @@ vector ChimeraPerseusCommand::setParameters(){ CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); CommandParameter pcutoff("cutoff", "Number", "", "0.5", "", "", "","",false,false); parameters.push_back(pcutoff); @@ -40,13 +42,14 @@ string ChimeraPerseusCommand::getHelpString(){ try { string helpString = ""; helpString += "The chimera.perseus command reads a fastafile and namefile or countfile and outputs potentially chimeric sequences.\n"; - helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n"; + helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, dereplicate, alpha and beta.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n"; helpString += "The count parameter allows you to provide a count file associated with your fasta file. A count or name file is required. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The alpha parameter .... The default is -5.54. \n"; helpString += "The beta parameter .... The default is 0.33. \n"; helpString += "The cutoff parameter .... The default is 0.50. \n"; @@ -461,6 +464,13 @@ ChimeraPerseusCommand::ChimeraPerseusCommand(string option) { temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.33"; } m->mothurConvert(temp, beta); + + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { + if (groupfile != "") { temp = "false"; } + else { temp = "true"; } + } + dups = m->isTrue(temp); } } catch(exception& e) { @@ -524,7 +534,9 @@ int ChimeraPerseusCommand::execute(){ if (m->control_pressed) { delete ct; delete cparser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } map uniqueNames = cparser->getAllSeqsMap(); - numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + if (!dups) { + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + } delete cparser; m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); @@ -560,7 +572,9 @@ int ChimeraPerseusCommand::execute(){ if (m->control_pressed) { delete parser; for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } map uniqueNames = parser->getAllSeqsMap(); - numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + if (!dups) { + numChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName); + } delete parser; m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index fdece95..b3d6ccf 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -46,7 +46,7 @@ private: linePair(int i, int j) : start(i), end(j) {} }; - bool abort, hasName, hasCount; + bool abort, hasName, hasCount, dups; string fastafile, groupfile, countfile, outputDir, namefile; int processors, alignLength; double cutoff, alpha, beta; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index a29fc82..576fee9 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -31,12 +31,14 @@ vector ChimeraSlayerCommand::setParameters(){ CommandParameter pminbs("minbs", "Number", "", "90", "", "", "","",false,false); parameters.push_back(pminbs); CommandParameter psearch("search", "Multiple", "kmer-blast", "blast", "", "", "","",false,false); parameters.push_back(psearch); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter prealign("realign", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(prealign); CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "","fasta",false,false); parameters.push_back(ptrim); CommandParameter psplit("split", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psplit); CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "","",false,false); parameters.push_back(pnumwanted); CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "","",false,false); parameters.push_back(pdivergence); + CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pdups); CommandParameter pparents("parents", "Number", "", "3", "", "", "","",false,false); parameters.push_back(pparents); CommandParameter pincrement("increment", "Number", "", "5", "", "", "","",false,false); parameters.push_back(pincrement); CommandParameter pblastlocation("blastlocation", "String", "", "", "", "", "","",false,false); parameters.push_back(pblastlocation); @@ -59,7 +61,7 @@ string ChimeraSlayerCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; - helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; + helpString += "The chimera.slayer command parameters are fasta, name, group, template, processors, dereplicate, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, blastlocation and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using reference=self. \n"; helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; @@ -70,6 +72,7 @@ string ChimeraSlayerCommand::getHelpString(){ #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n"; helpString += "The trim parameter allows you to output a new fasta file containing your sequences with the chimeric ones trimmed to include only their longest piece, default=F. \n"; helpString += "The split parameter allows you to check both pieces of non-chimeric sequence for chimeras, thus looking for trimeras and quadmeras. default=F. \n"; helpString += "The window parameter allows you to specify the window size for searching for chimeras, default=50. \n"; @@ -595,6 +598,13 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "15"; } m->mothurConvert(temp, numwanted); + + temp = validParameter.validFile(parameters, "dereplicate", false); + if (temp == "not found") { + if (groupfile != "") { temp = "false"; } + else { temp = "true"; } + } + dups = m->isTrue(temp); blastlocation = validParameter.validFile(parameters, "blastlocation", false); if (blastlocation == "not found") { blastlocation = ""; } @@ -749,8 +759,10 @@ int ChimeraSlayerCommand::execute(){ if (pid == 0) { #endif - totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); - m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + if (!dups) { + totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, trimFastaFileName); + m->mothurOutEndLine(); m->mothurOut(toString(totalChimeras) + " chimera found."); m->mothurOutEndLine(); + } #ifdef USE_MPI } MPI_Barrier(MPI_COMM_WORLD); //make everyone wait diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 6149ef5..0c6ebe2 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -70,7 +70,7 @@ private: int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&, string, map&, bool); #endif - bool abort, realign, trim, trimera, save, hasName, hasCount; + bool abort, realign, trim, trimera, save, hasName, hasCount, dups; string fastafile, groupfile, templatefile, outputDir, search, namefile, countfile, blastlocation; int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 04e1f1d..ce7ce45 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -556,11 +556,11 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { temp = validParameter.validFile(parameters, "skipgaps2", false); if (temp == "not found") { temp = "t"; } skipgaps2 = m->isTrue(temp); - string usedDups = "false"; + temp = validParameter.validFile(parameters, "dereplicate", false); if (temp == "not found") { if (groupfile != "") { temp = "false"; } - else { temp = "true"; usedDups = ""; } + else { temp = "true"; } } dups = m->isTrue(temp); diff --git a/clustercommand.cpp b/clustercommand.cpp index 93e29fd..e4e2062 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -336,10 +336,10 @@ int ClusterCommand::execute(){ map variables; variables["[filename]"] = fileroot; - if (countfile != "") { variables["[tag2]"] = "unique_list"; } variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); + if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { diff --git a/clusterdoturcommand.cpp b/clusterdoturcommand.cpp index 4463896..91eb923 100644 --- a/clusterdoturcommand.cpp +++ b/clusterdoturcommand.cpp @@ -244,10 +244,10 @@ int ClusterDoturCommand::execute(){ map variables; variables["[filename]"] = fileroot; - if (countfile != "") { variables["[tag2]"] = "unique_list"; } variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); + if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index ac5e755..6d3c86f 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -818,10 +818,10 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us map variables; variables["[filename]"] = fileroot; - if (countfile != "") { variables["[tag2]"] = "unique_list"; } variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); + if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { diff --git a/filtersharedcommand.cpp b/filtersharedcommand.cpp index 2899121..413a9a4 100644 --- a/filtersharedcommand.cpp +++ b/filtersharedcommand.cpp @@ -17,6 +17,8 @@ vector FilterSharedCommand::setParameters(){ CommandParameter pminpercent("minpercent", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercent); CommandParameter pminabund("minabund", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminabund); CommandParameter pmintotal("mintotal", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pmintotal); + CommandParameter pminnumsamples("minnumsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminnumsamples); + CommandParameter pminpercentsamples("minpercentsamples", "Number", "", "-1", "", "", "","",false,false,true); parameters.push_back(pminpercentsamples); CommandParameter pmakerare("makerare", "Boolean", "", "T", "", "", "","",false,false,true); parameters.push_back(pmakerare); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -35,12 +37,14 @@ string FilterSharedCommand::getHelpString(){ try { string helpString = ""; helpString += "The filter.shared command is used to remove OTUs based on various critieria.\n"; - helpString += "The filter.shared command parameters are shared, minpercent, minabund, mintotal, makerare, groups and label. You must provide a shared file.\n"; + helpString += "The filter.shared command parameters are shared, minpercent, minabund, mintotal, minnumsamples, minpercentsamples, makerare, groups and label. You must provide a shared file.\n"; helpString += "The groups parameter allows you to specify which of the groups you would like included. The group names are separated by dashes.\n"; helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n"; helpString += "The minabund parameter allows you indicate the minimum abundance required for each sample in a given OTU. If any samples abundance falls below the minimum, the OTU is removed. Default=0\n"; helpString += "The minpercent parameter allows you indicate the minimum relative abundance of an OTU. For example, if the OTUs total abundance across all samples is 8, and the total abundance across all OTUs is 1000, and minpercent=1. The OTU's relative abundance is 0.008, the minimum is 0.01, so the OTU will be removed. Default=0.\n"; + helpString += "The minnumsamples parameter allows you indicate the minimum number of samples present in an OTU. If the number of samples present falls below the minimum, the OTU is removed. Default=0.\n"; + helpString += "The minpercentsamples parameter allows you indicate the minimum percent of sample present in an OTU. For example, if the total number of samples is 10, the number present is 3, and the minpercentsamples=50. The OTU's precent of samples is 0.333, the minimum is 0.50, so the OTU will be removed. Default=0.\n"; helpString += "The mintotal parameter allows you indicate the minimum abundance required for a given OTU. If abundance across all samples falls below the minimum, the OTU is removed. Default=0.\n"; helpString += "The makerare parameter allows you indicate you want the abundances of any removed OTUs to be saved and a new \"rare\" OTU created with its abundances equal to the sum of the OTUs removed. This will preserve the number of reads in your dataset. Default=T\n"; @@ -166,6 +170,11 @@ FilterSharedCommand::FilterSharedCommand(string option) { else { setSomething = true; } m->mothurConvert(temp, minTotal); + temp = validParameter.validFile(parameters, "minnumsamples", false); + if (temp == "not found"){ temp = "-1"; } + else { setSomething = true; } + m->mothurConvert(temp, minSamples); + temp = validParameter.validFile(parameters, "minpercent", false); if (temp == "not found"){ temp = "-1"; } else { setSomething = true; } @@ -173,6 +182,14 @@ FilterSharedCommand::FilterSharedCommand(string option) { m->mothurConvert(temp, minPercent); if (minPercent < 1) {} //already in percent form else { minPercent = minPercent / 100.0; } //user gave us a whole number version so convert to % + + temp = validParameter.validFile(parameters, "minpercentsamples", false); + if (temp == "not found"){ temp = "-1"; } + else { setSomething = true; } + + m->mothurConvert(temp, minPercentSamples); + if (minPercentSamples < 1) {} //already in percent form + else { minPercentSamples = minPercentSamples / 100.0; } //user gave us a whole number version so convert to % temp = validParameter.validFile(parameters, "makerare", false); if (temp == "not found"){ temp = "T"; } makeRare = m->isTrue(temp); @@ -347,6 +364,25 @@ int FilterSharedCommand::processShared(vector& thislookup) if (percent < minPercent) { okay = false; } } + + if (okay && (minSamples != -1)) { + int samples = 0; + for (int j = 0; j < thislookup.size(); j++) { + if (thislookup[j]->getAbundance(i) != 0) { samples++; } + } + if (samples < minSamples) { okay = false; } + } + + if (okay && (minPercentSamples != -0.01)) { + double samples = 0; + double total = thislookup.size(); + for (int j = 0; j < thislookup.size(); j++) { + if (thislookup[j]->getAbundance(i) != 0) { samples++; } + } + double percent = samples / total; + if (percent < minPercentSamples) { okay = false; } + } + //did this OTU pass the filter criteria if (okay) { filteredLabels.push_back(saveBinLabels[i]); diff --git a/filtersharedcommand.h b/filtersharedcommand.h index f97cbf9..a97f96e 100644 --- a/filtersharedcommand.h +++ b/filtersharedcommand.h @@ -38,8 +38,8 @@ private: set labels; //holds labels to be used string groups, label, outputDir, sharedfile; vector Groups, outputNames; - int minAbund, minTotal; - float minPercent; + int minAbund, minTotal, minSamples; + float minPercent, minPercentSamples; int processShared(vector&); diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 0a2f91f..24c4f27 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -271,10 +271,10 @@ int MGClusterCommand::execute(){ map variables; variables["[filename]"] = fileroot; - if (countfile != "") { variables["[tag2]"] = "unique_list"; } variables["[clustertag]"] = tag; string sabundFileName = getOutputFileName("sabund", variables); string rabundFileName = getOutputFileName("rabund", variables); + if (countfile != "") { variables["[tag2]"] = "unique_list"; } string listFileName = getOutputFileName("list", variables); if (countfile == "") { diff --git a/parselistscommand.cpp b/parselistscommand.cpp index 61cb245..3ed65c6 100644 --- a/parselistscommand.cpp +++ b/parselistscommand.cpp @@ -13,7 +13,8 @@ vector ParseListCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(plist); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pgroup); + CommandParameter pcount("count", "InputTypes", "", "", "CountGroup", "CountGroup", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none","",false,false,true); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -31,9 +32,11 @@ vector ParseListCommand::setParameters(){ string ParseListCommand::getHelpString(){ try { string helpString = ""; - helpString += "The parse.list command reads a list and group file and generates a list file for each group in the groupfile. \n"; - helpString += "The parse.list command parameters are list, group and label.\n"; - helpString += "The list and group parameters are required.\n"; + helpString += "The parse.list command reads a list and group or count file and generates a list file for each group in the group or count file. \n"; + helpString += "The parse.list command parameters are list, group, count and label.\n"; + helpString += "The list and group or count parameters are required.\n"; + helpString += "If a count file is provided, mothur assumes the list file contains only unique names.\n"; + helpString += "If a group file is provided, mothur assumes the list file contains all names.\n"; helpString += "The label parameter is used to read specific labels in your input you want to use.\n"; helpString += "The parse.list command should be used in the following format: parse.list(list=yourListFile, group=yourGroupFile, label=yourLabels).\n"; helpString += "Example: parse.list(list=abrecovery.fn.list, group=abrecovery.groups, label=0.03).\n"; @@ -121,11 +124,18 @@ ParseListCommand::ParseListCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + //check for required parameters listfile = validParameter.validFile(parameters, "list", true); @@ -139,28 +149,39 @@ ParseListCommand::ParseListCommand(string option) { } }else { m->setListFile(listfile); } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(listfile); } - groupfile = validParameter.validFile(parameters, "group", true); - if (groupfile == "not open") { abort = true; } - else if (groupfile == "not found") { - groupfile = m->getListFile(); - if (groupfile != "") { - m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); - groupMap = new GroupMap(groupfile); - - int error = groupMap->readMap(); - if (error == 1) { abort = true; } - }else { m->mothurOut("No valid current group file. You must provide a group file."); m->mothurOutEndLine(); abort = true; } - }else { - m->setGroupFile(groupfile); + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not found") { groupfile = ""; groupMap = NULL; } + else if (groupfile == "not open") { abort = true; groupfile = ""; groupMap = NULL; } + else { + m->setGroupFile(groupfile); groupMap = new GroupMap(groupfile); int error = groupMap->readMap(); if (error == 1) { abort = true; } - } - - //do you have all files needed - if ((listfile == "") || (groupfile == "")) { m->mothurOut("You must enter both a listfile and groupfile for the parse.list command. "); m->mothurOutEndLine(); abort = true; } + } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { abort = true; countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + if (!ct.hasGroupInfo()) { + abort = true; + m->mothurOut("[ERROR]: The parse.list command requires group info to be present in your countfile, quitting."); m->mothurOutEndLine(); + } + + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + }else if ((groupfile == "") && (countfile == "")) { + m->mothurOut("[ERROR]: you must provide one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -191,7 +212,10 @@ int ParseListCommand::execute(){ //fill filehandles with neccessary ofstreams int i; ofstream* temp; - vector gGroups = groupMap->getNamesOfGroups(); + vector gGroups; + if (groupfile != "") { gGroups = groupMap->getNamesOfGroups(); } + else { gGroups = ct.getNamesOfGroups(); } + for (i=0; i processedLabels; set userLabels = labels; - input = new InputData(listfile, "list"); - list = input->getListVector(); + InputData input(listfile, "list"); + list = input.getListVector(); string lastLabel = list->getLabel(); if (m->control_pressed) { - delete input; delete list; delete groupMap; - vector gGroups = groupMap->getNamesOfGroups(); + delete list; if (groupfile != "") { delete groupMap; } for (i=0; imothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -221,7 +244,7 @@ int ParseListCommand::execute(){ while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { if (m->control_pressed) { - delete input; delete list; delete groupMap; + delete list; if (groupfile != "") { delete groupMap; } for (i=0; imothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -239,8 +262,7 @@ int ParseListCommand::execute(){ if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { string saveLabel = list->getLabel(); - delete list; - list = input->getListVector(lastLabel); //get new list vector to process + list = input.getListVector(lastLabel); //get new list vector to process m->mothurOut(list->getLabel()); m->mothurOutEndLine(); parse(list); @@ -256,11 +278,11 @@ int ParseListCommand::execute(){ lastLabel = list->getLabel(); delete list; - list = input->getListVector(); //get new list vector to process + list = input.getListVector(); //get new list vector to process } if (m->control_pressed) { - delete input; delete groupMap; + if (groupfile != "") { delete groupMap; } for (i=0; imothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -281,7 +303,7 @@ int ParseListCommand::execute(){ } if (m->control_pressed) { - delete input; delete groupMap; + if (groupfile != "") { delete groupMap; } for (i=0; imothurRemove(outputNames[i]); } outputTypes.clear(); return 0; @@ -290,7 +312,7 @@ int ParseListCommand::execute(){ //run last label if you need to if (needToRun == true) { if (list != NULL) { delete list; } - list = input->getListVector(lastLabel); //get new list vector to process + list = input.getListVector(lastLabel); //get new list vector to process m->mothurOut(list->getLabel()); m->mothurOutEndLine(); parse(list); @@ -303,9 +325,7 @@ int ParseListCommand::execute(){ delete it3->second; } - - delete groupMap; - delete input; + if (groupfile != "") { delete groupMap; } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); @@ -357,17 +377,33 @@ int ParseListCommand::parse(ListVector* thisList) { //parse bin into list of sequences in each group for (int j = 0; j < names.size(); j++) { - string group = groupMap->getGroup(names[j]); + if (groupfile != "") { + string group = groupMap->getGroup(names[j]); - if (group == "not found") { m->mothurOut(names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); exit(1); } + if (group == "not found") { m->mothurOut(names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); exit(1); } - itGroup = groupBins.find(group); - if(itGroup == groupBins.end()) { - groupBins[group] = names[j]; //add first name - groupNumBins[group]++; - }else{ //add another name - groupBins[group] = groupBins[group] + "," + names[j]; - } + itGroup = groupBins.find(group); + if(itGroup == groupBins.end()) { + groupBins[group] = names[j]; //add first name + groupNumBins[group]++; + }else{ //add another name + groupBins[group] = groupBins[group] + "," + names[j]; + } + }else{ + vector thisSeqsGroups = ct.getGroups(names[j]); + + for (int k = 0; k < thisSeqsGroups.size(); k++) { + string group = thisSeqsGroups[k]; + itGroup = groupBins.find(group); + if(itGroup == groupBins.end()) { + groupBins[group] = names[j]; //add first name + groupNumBins[group]++; + }else{ //add another name + groupBins[group] = groupBins[group] + "," + names[j]; + } + + } + } } //print parsed bin info to files diff --git a/parselistscommand.h b/parselistscommand.h index b366d28..84abc85 100644 --- a/parselistscommand.h +++ b/parselistscommand.h @@ -40,10 +40,10 @@ private: ListVector* list; GroupMap* groupMap; - InputData* input; + CountTable ct; ofstream out; - string outputDir, listfile, groupfile, label; + string outputDir, listfile, groupfile, label, countfile; set labels; bool abort, allLines; vector outputNames; diff --git a/phylotree.cpp b/phylotree.cpp index 8a7c712..b9bab4e 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -259,6 +259,8 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ //somehow the parent is getting one too many accnos //use print to reassign the taxa id taxon = getNextTaxon(seqTaxonomy, seqName); + + if (m->debug) { m->mothurOut(seqName +'\t' + taxon +'\n'); } if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies.insert(currentNode); } break; } @@ -341,6 +343,9 @@ void PhyloTree::assignHeirarchyIDs(int index){ int counter = 1; for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + + if (m->debug) { m->mothurOut(toString(index) +'\t' + tree[it->second].name +'\n'); } + tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); counter++; tree[it->second].level = tree[index].level + 1; @@ -399,6 +404,8 @@ void PhyloTree::binUnclassified(string file){ } } + if (m->debug) { m->mothurOut("maxLevel = " + toString(maxLevel) +'\n'); } + int copyNodes = copy.size(); //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary @@ -409,11 +416,14 @@ void PhyloTree::binUnclassified(string file){ int level = copy[itLeaf->second].level; int currentNode = itLeaf->second; + + if (m->debug) { m->mothurOut(copy[currentNode].name +'\n'); } //this sequence is unclassified at some levels while(level < maxLevel){ level++; + if (m->debug) { m->mothurOut("level = " + toString(level) +'\n'); } string taxon = "unclassified"; diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index dfb89b9..f61232a 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -211,14 +211,25 @@ SensSpecCommand::SensSpecCommand(string option) { int SensSpecCommand::execute(){ try{ if (abort == true) { if (calledHelp) { return 0; } return 2; } - + + int startTime = time(NULL); + + //create list file with only unique names, saves time and memory by removing redundant names from list file that are not in the distance file. + string newListFile = preProcessList(); + if (newListFile != "") { listFile = newListFile; } + setUpOutput(); outputNames.push_back(sensSpecFileName); outputTypes["sensspec"].push_back(sensSpecFileName); if(format == "phylip") { processPhylip(); } else if(format == "column") { processColumn(); } + //remove temp file if created + if (newListFile != "") { m->mothurRemove(newListFile); } + if (m->control_pressed) { m->mothurRemove(sensSpecFileName); return 0; } - + + m->mothurOut("It took " + toString(time(NULL) - startTime) + " to run sens.spec."); m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); m->mothurOut(sensSpecFileName); m->mothurOutEndLine(); @@ -684,6 +695,122 @@ void SensSpecCommand::outputStatistics(string label, string cutoff){ exit(1); } } +//*************************************************************************************************************** + +string SensSpecCommand::preProcessList(){ + try { + set uniqueNames; + //get unique names from distance file + if (format == "phylip") { + + ifstream phylipFile; + m->openInputFile(distFile, phylipFile); + string numTest; + int pNumSeqs; + phylipFile >> numTest; + + if (!m->isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting."); m->mothurOutEndLine(); exit(1); } + else { + m->mothurConvert(numTest, pNumSeqs); + } + phylipFile >> pNumSeqs; m->gobble(phylipFile); + + string seqName; + double distance; + + for(int i=0;icontrol_pressed) { return ""; } + + phylipFile >> seqName; + uniqueNames.insert(seqName); + + for(int j=0;j> distance; + } + m->gobble(phylipFile); + } + phylipFile.close(); + }else { + ifstream columnFile; + m->openInputFile(distFile, columnFile); + string seqNameA, seqNameB; + double distance; + + while(columnFile){ + if (m->control_pressed) { return ""; } + columnFile >> seqNameA >> seqNameB >> distance; + uniqueNames.insert(seqNameA); uniqueNames.insert(seqNameB); + m->gobble(columnFile); + } + columnFile.close(); + } + + //read list file, if numSeqs > unique names then remove redundant names + string newListFile = listFile + ".temp"; + ofstream out; + m->openOutputFile(newListFile, out); + ifstream in; + m->openInputFile(listFile, in); + + bool wroteSomething = false; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(newListFile); return ""; } + + //read in list vector + ListVector list(in); + + //listfile is already unique + if (list.getNumSeqs() == uniqueNames.size()) { in.close(); out.close(); m->mothurRemove(newListFile); return ""; } + + //make a new list vector + ListVector newList; + newList.setLabel(list.getLabel()); + + //for each bin + for (int i = 0; i < list.getNumBins(); i++) { + + //parse out names that are in accnos file + string binnames = list.get(i); + vector bnames; + m->splitAtComma(binnames, bnames); + + string newNames = ""; + for (int i = 0; i < bnames.size(); i++) { + string name = bnames[i]; + //if that name is in the .accnos file, add it + if (uniqueNames.count(name) != 0) { newNames += name + ","; } + } + + //if there are names in this bin add to new list + if (newNames != "") { + newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma + newList.push_back(newNames); + } + } + + //print new listvector + if (newList.getNumBins() != 0) { + wroteSomething = true; + newList.print(out); + } + + m->gobble(in); + } + in.close(); + out.close(); + + if (wroteSomething) { return newListFile; } + return ""; + } + catch(exception& e) { + m->errorOut(e, "SensSpecCommand", "preProcessList"); + exit(1); + } +} + //*************************************************************************************************************** diff --git a/sensspeccommand.h b/sensspeccommand.h index 73b74e3..7d85a39 100644 --- a/sensspeccommand.h +++ b/sensspeccommand.h @@ -58,6 +58,7 @@ private: int fillSeqPairSet(set&, ListVector*&); int process(map&, string, bool&, string&); int process(set&, string, bool&, string&, int); + string preProcessList(); }; -- 2.39.2