From: Sarah Westcott Date: Tue, 25 Sep 2012 13:18:08 +0000 (-0400) Subject: added count file to get.oturep, pre.cluster, screen.seqs, tree.shared. made remove... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=96dbe925073caefaed6e6db85659c144a806aeb1 added count file to get.oturep, pre.cluster, screen.seqs, tree.shared. made remove.lineage and get.lineage more flexible to remove or get taxons wrapped in quotes if not given in taxon parameter. --- diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index bab7740..2e55673 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -619,7 +619,7 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - string baseTName = taxonomyFileName; + string baseTName = m->getSimpleName(taxonomyFileName); if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); } //set rippedTaxName to diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index 378e8ab..f785c50 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -79,7 +79,7 @@ string ClusterFragmentsCommand::getOutputFileNameTag(string type, string inputNa else { if (type == "fasta") { outputFileName = "fragclust.fasta"; } else if (type == "name") { outputFileName = "fragclust.names"; } - else if (type == "count") { outputFileName = "fragclust.count.table"; } + else if (type == "count") { outputFileName = "fragclust.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index 4c7aefb..94d6682 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -66,7 +66,7 @@ string ConsensusSeqsCommand::getOutputFileNameTag(string type, string inputName= else { if (type == "fasta") { outputFileName = "cons.fasta"; } else if (type == "name") { outputFileName = "cons.names"; } - else if (type == "count") { outputFileName = "cons.count.table"; } + else if (type == "count") { outputFileName = "cons.count_table"; } else if (type == "summary") { outputFileName = "cons.summary"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 75d21c2..fa6fd4f 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -35,7 +35,7 @@ vector CountSeqsCommand::setParameters(){ string CountSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file. You may also provide a group file to get the counts broken down by group.\n"; + helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file. You may also provide a group file to get the counts broken down by group.\n"; helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n"; helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n"; helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n"; @@ -59,7 +59,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ it = outputTypes.find(type); if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } else { - if (type == "counttable") { outputFileName = "count.table"; } + if (type == "counttable") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; diff --git a/counttable.cpp b/counttable.cpp index cd623ec..a79047d 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -52,6 +52,12 @@ int CountTable::createTable(set& n, map& g, set& } } + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + return 0; } catch(exception& e) { @@ -179,7 +185,14 @@ int CountTable::createTable(string namefile, string groupfile, bool createGroup) in.close(); if (error) { m->control_pressed = true; } - if (groupfile != "") { delete groupMap; } + else { //check for zero groups + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + } + if (groupfile != "") { delete groupMap; } return 0; } @@ -243,6 +256,13 @@ int CountTable::readTable(string file) { in.close(); if (error) { m->control_pressed = true; } + else { //check for zero groups + if (hasGroups) { + for (int i = 0; i < totalGroups.size(); i++) { + if (totalGroups[i] == 0) { m->mothurOut("\nRemoving group: " + groups[i] + " because all sequences have been removed.\n"); removeGroup(groups[i]); i--; } + } + } + } return 0; } @@ -457,6 +477,51 @@ int CountTable::addGroup(string groupName) { } } /************************************************************/ +//remove group +int CountTable::removeGroup(string groupName) { + try { + if (hasGroups) { + map::iterator it = indexGroupMap.find(groupName); + if (it == indexGroupMap.end()) { + m->mothurOut("[ERROR]: " + groupName + " is not in your count table. Please correct.\n"); m->control_pressed = true; + }else { + int indexOfGroupToRemove = it->second; + map currentGroupIndex = indexGroupMap; + vector newGroups; + for (int i = 0; i < groups.size(); i++) { + if (groups[i] != groupName) { + newGroups.push_back(groups[i]); + indexGroupMap[groups[i]] = i; + } + } + indexGroupMap.erase(groupName); + groups = newGroups; + totalGroups.erase(totalGroups.begin()+indexOfGroupToRemove); + + for (int i = 0; i < counts.size(); i++) { + int num = counts[i][indexOfGroupToRemove]; + counts[i].erase(counts[i].begin()+indexOfGroupToRemove); + totals[i] -= num; + total -= num; + if (totals[i] == 0) { //your sequences are only from the group we want to remove, then remove you. + counts.erase(counts.begin()+i); + totals.erase(totals.begin()+i); + uniques--; + i--; + } + } + if (groups.size() == 0) { hasGroups = false; } + } + }else { m->mothurOut("[ERROR]: your count table does not contain group information, can not remove group " + groupName + ".\n"); m->control_pressed = true; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "removeGroup"); + exit(1); + } +} +/************************************************************/ //vector of groups for the seq vector CountTable::getGroups(string seqName) { try { @@ -560,7 +625,7 @@ int CountTable::push_back(string seqName) { int CountTable::remove(string seqName) { try { map::iterator it = indexNameMap.find(seqName); - if (it == indexNameMap.end()) { + if (it != indexNameMap.end()) { uniques--; if (hasGroups){ //remove this sequences counts from group totals for (int i = 0; i < totalGroups.size(); i++) { totalGroups[i] -= counts[it->second][i]; counts[it->second][i] = 0; } diff --git a/counttable.h b/counttable.h index b66c4c6..34c941b 100644 --- a/counttable.h +++ b/counttable.h @@ -47,9 +47,11 @@ class CountTable { CountTable() { m = MothurOut::getInstance(); hasGroups = false; total = 0; uniques = 0; } ~CountTable() {} + //reads and creates smart enough to eliminate groups with zero counts int createTable(set&, map&, set&); //seqNames, seqName->group, groupNames int createTable(string, string, bool); //namefile, groupfile, createGroup - int readTable(string); + int readTable(string); + int printTable(string); int printHeaders(ofstream&); int printSeq(ofstream&, string); @@ -60,6 +62,7 @@ class CountTable { int getNumGroups() { return groups.size(); } vector getNamesOfGroups() { return groups; } //returns group names, if no group info vector is blank. int addGroup(string); + int removeGroup(string); int renameSeq(string, string); //used to change name of sequence for use with trees int setAbund(string, string, int); //set abundance number of seqs for that group for that seq diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp index b6f71c8..90a40ce 100644 --- a/deconvolutecommand.cpp +++ b/deconvolutecommand.cpp @@ -57,7 +57,7 @@ string DeconvoluteCommand::getOutputFileNameTag(string type, string inputName="" else { if (type == "fasta") { outputFileName = "unique" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "names"; } - else if (type == "count") { outputFileName = "count.table"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp index 910a872..69f4403 100644 --- a/getgroupscommand.cpp +++ b/getgroupscommand.cpp @@ -72,7 +72,7 @@ string GetGroupsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index 1cd139b..645655d 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -10,6 +10,7 @@ #include "getlineagecommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector GetLineageCommand::setParameters(){ @@ -71,7 +72,7 @@ string GetLineageCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } @@ -438,6 +439,14 @@ int GetLineageCommand::readCount(){ } in.close(); out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -691,15 +700,17 @@ int GetLineageCommand::readTax(){ in >> name; //read from first column in >> tax; //read from second column + string noQuotesTax = m->removeQuotes(tax); + for (int j = 0; j < listOfTaxons.size(); j++) { - string newtax = tax; + string newtax = noQuotesTax; //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them if (!taxonsHasConfidence[j]) { - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = tax; + newtax = noQuotesTax; m->removeConfidences(newtax); } @@ -712,7 +723,7 @@ int GetLineageCommand::readTax(){ break; } }else{//if listOfTaxons[i] has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences == string::npos) { int pos = newtax.find(noConfidenceTaxons[j]); @@ -726,10 +737,10 @@ int GetLineageCommand::readTax(){ }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons //first remove confidences from both and see if the taxonomy exists - string noNewTax = tax; - int hasConfidences = tax.find_first_of('('); + string noNewTax = noQuotesTax; + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = tax; + noNewTax = noQuotesTax; m->removeConfidences(noNewTax); } diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 4967f24..9f4dd54 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -41,9 +41,10 @@ vector GetOTURepCommand::setParameters(){ try { CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); @@ -68,9 +69,9 @@ vector GetOTURepCommand::setParameters(){ string GetOTURepCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n"; + helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n"; helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n"; - helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"; + helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name or count filename is required. \n"; helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"; helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n"; helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n"; @@ -106,6 +107,7 @@ string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){ else { if (type == "fasta") { outputFileName = "rep.fasta"; } else if (type == "name") { outputFileName = "rep.names"; } + else if (type == "count") { outputFileName = "rep.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -123,6 +125,7 @@ GetOTURepCommand::GetOTURepCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand"); @@ -157,6 +160,7 @@ GetOTURepCommand::GetOTURepCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -210,6 +214,14 @@ GetOTURepCommand::GetOTURepCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -245,6 +257,24 @@ GetOTURepCommand::GetOTURepCommand(string option) { if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + hasGroups = false; + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { abort = true; countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + if (ct.hasGroupInfo()) { hasGroups = true; } + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these? //give priority to column, then phylip @@ -261,14 +291,18 @@ GetOTURepCommand::GetOTURepCommand(string option) { }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } - } + } } //check for optional parameter and set defaults @@ -292,15 +326,15 @@ GetOTURepCommand::GetOTURepCommand(string option) { sorted = ""; } - if ((sorted == "group") && (groupfile == "")) { - m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine(); + if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) { + m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine(); sorted = ""; } groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } else { - if (groupfile == "") { + if ((groupfile == "") && (!hasGroups)) { m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine(); abort = true; }else { @@ -340,106 +374,9 @@ int GetOTURepCommand::execute(){ int error; list = NULL; - if (!large) { - //read distance files - if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); } - else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); } - else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } - - readMatrix->setCutoff(cutoff); - - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - }else{ nameMap = NULL; } - - readMatrix->read(nameMap); - - if (m->control_pressed) { delete readMatrix; return 0; } - - list = readMatrix->getListVector(); - - SparseDistanceMatrix* matrix = readMatrix->getDMatrix(); - - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - seqVec = vector(list->size()); - for (int i = 0; i < matrix->seqVec.size(); i++) { - for (int j = 0; j < matrix->seqVec[i].size(); j++) { - if (m->control_pressed) { delete readMatrix; return 0; } - //already added everyone else in row - if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; } - } - } - //add dummy map for unweighted calc - SeqMap dummy; - seqVec.push_back(dummy); - - delete matrix; - delete readMatrix; - delete nameMap; - - if (m->control_pressed) { return 0; } - }else { - //process file and set up indexes - if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } - else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); } - else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } - - formatMatrix->setCutoff(cutoff); - - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - }else{ nameMap = NULL; } - - formatMatrix->read(nameMap); - - if (m->control_pressed) { delete formatMatrix; return 0; } - - list = formatMatrix->getListVector(); - - distFile = formatMatrix->getFormattedFileName(); - - //positions in file where the distances for each sequence begin - //rowPositions[1] = position in file where distance related to sequence 1 start. - rowPositions = formatMatrix->getRowPositions(); - rowPositions.push_back(-1); //dummy row for unweighted calc - - delete formatMatrix; - delete nameMap; - - //openfile for getMap to use - m->openInputFile(distFile, inRow); - - if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; } - } - - - //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix - if (list != NULL) { - vector names; - string binnames; - //map names to rows in sparsematrix - for (int i = 0; i < list->size(); i++) { - names.clear(); - binnames = list->get(i); - - m->splitAtComma(binnames, names); - - for (int j = 0; j < names.size(); j++) { - nameToIndex[names[j]] = i; - } - } - } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); } - + readDist(); - if (m->control_pressed) { - if (large) { inRow.close(); m->mothurRemove(distFile); } - return 0; - } + if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); } return 0; } if (groupfile != "") { //read in group map info. @@ -448,13 +385,18 @@ int GetOTURepCommand::execute(){ if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; } if (Groups.size() != 0) { - SharedUtil* util = new SharedUtil(); + SharedUtil util; vector gNamesOfGroups = groupMap->getNamesOfGroups(); - util->setGroups(Groups, gNamesOfGroups, "getoturep"); + util.setGroups(Groups, gNamesOfGroups, "getoturep"); groupMap->setNamesOfGroups(gNamesOfGroups); - delete util; } - } + }else if (hasGroups) { + if (Groups.size() != 0) { + SharedUtil util; + vector gNamesOfGroups = ct.getNamesOfGroups(); + util.setGroups(Groups, gNamesOfGroups, "getoturep"); + } + } //done with listvector from matrix if (list != NULL) { delete list; } @@ -595,6 +537,11 @@ int GetOTURepCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -608,7 +555,116 @@ int GetOTURepCommand::execute(){ exit(1); } } +//********************************************************************************************************************** +int GetOTURepCommand::readDist() { + try { + + if (!large) { + //read distance files + if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); } + else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); } + else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } + + readMatrix->setCutoff(cutoff); + + NameAssignment* nameMap = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + readMatrix->read(&ct); + } + + if (m->control_pressed) { delete readMatrix; return 0; } + + list = readMatrix->getListVector(); + SparseDistanceMatrix* matrix = readMatrix->getDMatrix(); + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + seqVec = vector(list->size()); + for (int i = 0; i < matrix->seqVec.size(); i++) { + for (int j = 0; j < matrix->seqVec[i].size(); j++) { + if (m->control_pressed) { delete readMatrix; return 0; } + //already added everyone else in row + if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; } + } + } + //add dummy map for unweighted calc + SeqMap dummy; + seqVec.push_back(dummy); + + delete matrix; + delete readMatrix; + delete nameMap; + + if (m->control_pressed) { return 0; } + }else { + //process file and set up indexes + if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); } + else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); } + else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; } + + formatMatrix->setCutoff(cutoff); + + NameAssignment* nameMap = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + readMatrix->read(&ct); + } + + if (m->control_pressed) { delete formatMatrix; return 0; } + + list = formatMatrix->getListVector(); + distFile = formatMatrix->getFormattedFileName(); + + //positions in file where the distances for each sequence begin + //rowPositions[1] = position in file where distance related to sequence 1 start. + rowPositions = formatMatrix->getRowPositions(); + rowPositions.push_back(-1); //dummy row for unweighted calc + + delete formatMatrix; + delete nameMap; + + //openfile for getMap to use + m->openInputFile(distFile, inRow); + + if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; } + } + + + //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix + if (list != NULL) { + vector names; + string binnames; + //map names to rows in sparsematrix + for (int i = 0; i < list->size(); i++) { + names.clear(); + binnames = list->get(i); + + m->splitAtComma(binnames, names); + + for (int j = 0; j < names.size(); j++) { + nameToIndex[names[j]] = i; + } + } + } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); } + if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetOTURepCommand", "execute"); + exit(1); + } +} //********************************************************************************************************************** void GetOTURepCommand::readNamesFile() { try { @@ -677,32 +733,38 @@ void GetOTURepCommand::readNamesFile(bool w) { } } //********************************************************************************************************************** -string GetOTURepCommand::findRep(vector names) { +string GetOTURepCommand::findRep(vector names, string group) { try{ // if only 1 sequence in bin or processing the "unique" label, then // the first sequence of the OTU is the representative one if ((names.size() == 1)) { return names[0]; }else{ - vector seqIndex(names.size()); - vector max_dist(names.size()); - vector total_dist(names.size()); + vector seqIndex; //(names.size()); map::iterator itNameFile; map::iterator itNameIndex; //fill seqIndex and initialize sums for (size_t i = 0; i < names.size(); i++) { if (weighted) { - seqIndex[i] = nameToIndex[names[i]]; + seqIndex.push_back(nameToIndex[names[i]]); + if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone. + int numRep = 0; + if (group != "") { numRep = ct.getGroupCount(names[i], group); } + else { numRep = ct.getGroupCount(names[i]); } + for (int j = 1; j < numRep; j++) { //don't add yourself again + seqIndex.push_back(nameToIndex[names[i]]); + } + } }else { if (namefile == "") { itNameIndex = nameToIndex.find(names[i]); if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique - if (large) { seqIndex[i] = (rowPositions.size()-1); } - else { seqIndex[i] = (seqVec.size()-1); } + if (large) { seqIndex.push_back((rowPositions.size()-1)); } + else { seqIndex.push_back((seqVec.size()-1)); } }else { - seqIndex[i] = itNameIndex->second; + seqIndex.push_back(itNameIndex->second); } }else { @@ -715,17 +777,18 @@ string GetOTURepCommand::findRep(vector names) { string name2 = itNameFile->second; if (name1 == name2) { //then you are unique so add your real dists - seqIndex[i] = nameToIndex[names[i]]; + seqIndex.push_back(nameToIndex[names[i]]); }else { //add dummy - if (large) { seqIndex[i] = (rowPositions.size()-1); } - else { seqIndex[i] = (seqVec.size()-1); } + if (large) { seqIndex.push_back((rowPositions.size()-1)); } + else { seqIndex.push_back((seqVec.size()-1)); } } } } } - max_dist[i] = 0.0; - total_dist[i] = 0.0; } + + vector max_dist(seqIndex.size(), 0.0); + vector total_dist(seqIndex.size(), 0.0); // loop through all entries in seqIndex SeqMap::iterator it; @@ -795,19 +858,33 @@ int GetOTURepCommand::process(ListVector* processList) { map filehandles; if (Groups.size() == 0) { //you don't want to use groups - outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + getOutputFileNameTag("name"); - m->openOutputFile(outputNamesFile, newNamesOutput); - outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "."; + if (countfile == "") { + outputNamesFile += getOutputFileNameTag("name"); + outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + }else { + outputNamesFile += getOutputFileNameTag("count"); + outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); + } outputNameFiles[outputNamesFile] = processList->getLabel(); + m->openOutputFile(outputNamesFile, newNamesOutput); + newNamesOutput << "noGroup" << endl; }else{ //you want to use groups ofstream* temp; for (int i=0; igetRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "." + getOutputFileNameTag("name"); + outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + "."; + if (countfile == "") { + outputNamesFile += getOutputFileNameTag("name"); + outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + }else { + outputNamesFile += getOutputFileNameTag("count"); + outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile); + } m->openOutputFile(outputNamesFile, *(temp)); - outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); + *(temp) << Groups[i] << endl; outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i]; } } @@ -832,7 +909,7 @@ int GetOTURepCommand::process(ListVector* processList) { m->splitAtComma(temp, namesInBin); if (Groups.size() == 0) { - nameRep = findRep(namesInBin); + nameRep = findRep(namesInBin, ""); newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl; }else{ map > NamesInGroup; @@ -841,20 +918,25 @@ int GetOTURepCommand::process(ListVector* processList) { } for (int j=0; jgetGroup(namesInBin[j]); - - if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } - - if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group - NamesInGroup[thisgroup].push_back(namesInBin[j]); - } + if (groupfile != "") { + string thisgroup = groupMap->getGroup(namesInBin[j]); + if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + + //add this name to correct group + if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); } + }else { + vector thisSeqsGroups = ct.getGroups(namesInBin[j]); + for (int k = 0; k < thisSeqsGroups.size(); k++) { + if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); } + } + } } //get rep for each group in otu for (int j=0; jopenOutputFile(tempNameFile, out2); - + ifstream in; m->openInputFile(filename, in); int i = 0; + string tempGroup = ""; + in >> tempGroup; m->gobble(in); + + CountTable thisCt; + if (countfile != "") { + thisCt.readTable(countfile); + if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; } + } + + int thistotal = 0; while (!in.eof()) { string rep, binnames; in >> i >> rep >> binnames; m->gobble(in); - out2 << rep << '\t' << binnames << endl; vector names; m->splitAtComma(binnames, names); int binsize = names.size(); - + + if (countfile == "") { out2 << rep << '\t' << binnames << endl; } + else { + if (tempGroup == "noGroup") { + for (int j = 0; j < names.size(); j++) { + if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); } + } + binsize = thisCt.getNumSeqs(rep); + }else { + int total = 0; + for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); } + out2 << rep << '\t' << total << '\t' << total << endl; + binsize = total; + } + } + thistotal += binsize; //if you have a groupfile string group = ""; + map groups; + map::iterator groupIt; if (groupfile != "") { - map groups; - map::iterator groupIt; - //find the groups that are in this bin - for (size_t i = 0; i < names.size(); i++) { + for (int i = 0; i < names.size(); i++) { string groupName = groupMap->getGroup(names[i]); if (groupName == "not found") { m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine(); @@ -937,7 +1042,21 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { } //rip off last dash group = group.substr(0, group.length()-1); - }else{ group = ""; } + }else if (hasGroups) { + map groups; + for (int i = 0; i < names.size(); i++) { + vector thisSeqsGroups = ct.getGroups(names[i]); + for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; } + } + //turn the groups into a string + for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { + group += groupIt->first + "-"; + } + //rip off last dash + group = group.substr(0, group.length()-1); + //cout << group << endl; + } + else{ group = ""; } //print out name and sequence for that bin @@ -947,7 +1066,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { if (sorted == "") { //print them out rep = rep + "\t" + toString(i+1); rep = rep + "|" + toString(binsize); - if (groupfile != "") { + if (group != "") { rep = rep + "|" + group; } out << ">" << rep << endl; @@ -973,7 +1092,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { string sequence = fasta->getSequence(reps[i].name); string outputName = reps[i].name + "\t" + toString(reps[i].bin); outputName = outputName + "|" + toString(reps[i].size); - if (groupfile != "") { + if (reps[i].group != "") { outputName = outputName + "|" + reps[i].group; } out << ">" << outputName << endl; @@ -984,9 +1103,11 @@ int GetOTURepCommand::processFastaNames(string filename, string label) { in.close(); out.close(); out2.close(); - + m->mothurRemove(filename); rename(tempNameFile.c_str(), filename.c_str()); + + if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } return 0; @@ -1012,10 +1133,35 @@ int GetOTURepCommand::processNames(string filename, string label) { int i = 0; string rep, binnames; + + string tempGroup = ""; + in >> tempGroup; m->gobble(in); + + CountTable thisCt; + if (countfile != "") { + thisCt.readTable(countfile); + if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; } + } + while (!in.eof()) { if (m->control_pressed) { break; } in >> i >> rep >> binnames; m->gobble(in); - out2 << rep << '\t' << binnames << endl; + + if (countfile == "") { out2 << rep << '\t' << binnames << endl; } + else { + vector names; + m->splitAtComma(binnames, names); + if (tempGroup == "noGroup") { + for (int j = 0; j < names.size(); j++) { + if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); } + } + }else { + int total = 0; + for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); } + out2 << rep << '\t' << total << '\t' << total << endl; + } + } + } in.close(); out2.close(); @@ -1023,6 +1169,8 @@ int GetOTURepCommand::processNames(string filename, string label) { m->mothurRemove(filename); rename(tempNameFile.c_str(), filename.c_str()); + if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); } + return 0; } catch(exception& e) { diff --git a/getoturepcommand.h b/getoturepcommand.h index d19a396..3906329 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -18,6 +18,7 @@ #include "groupmap.h" #include "readmatrix.hpp" #include "formatmatrix.h" +#include "counttable.h" typedef map SeqMap; @@ -60,10 +61,11 @@ private: ReadMatrix* readMatrix; FormatMatrix* formatMatrix; NameAssignment* nameMap; - string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format, outputDir, groups; + CountTable ct; + string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, countfile, columnfile, distFile, format, outputDir, groups; ofstream out; ifstream in, inNames, inRow; - bool abort, allLines, groupError, large, weighted; + bool abort, allLines, groupError, large, weighted, hasGroups; set labels; //holds labels to be used map nameToIndex; //maps sequence name to index in sparsematrix map nameFileMap; @@ -79,9 +81,10 @@ private: void readNamesFile(bool); int process(ListVector*); SeqMap getMap(int); - string findRep(vector); // returns the name of the "representative" sequence of given bin or subset of a bin, for groups + string findRep(vector, string); // returns the name of the "representative" sequence of given bin or subset of a bin, for groups int processNames(string, string); int processFastaNames(string, string); + int readDist(); }; #endif diff --git a/getseqscommand.cpp b/getseqscommand.cpp index e0faef4..6b16111 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -10,6 +10,7 @@ #include "getseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector GetSeqsCommand::setParameters(){ @@ -90,7 +91,7 @@ string GetSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } @@ -568,6 +569,13 @@ int GetSeqsCommand::readCount(){ } in.close(); out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -617,19 +625,16 @@ int GetSeqsCommand::readList(){ //parse out names that are in accnos file string binnames = list.get(i); + vector bnames; + m->splitAtComma(binnames, bnames); string newNames = ""; - while (binnames.find_first_of(',') != -1) { - string name = binnames.substr(0,binnames.find_first_of(',')); - binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); - + for (int i = 0; i < bnames.size(); i++) { + string name = bnames[i]; //if that name is in the .accnos file, add it if (names.count(name) != 0) { newNames += name + ","; selectedCount++; if (m->debug) { sanity["list"].insert(name); } } } - //get last name - if (names.count(binnames) != 0) { newNames += binnames + ","; selectedCount++; if (m->debug) { sanity["list"].insert(binnames); } } - //if there are names in this bin add to new list if (newNames != "") { newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp index 9e8d3e3..68e70ee 100644 --- a/makebiomcommand.cpp +++ b/makebiomcommand.cpp @@ -549,15 +549,16 @@ vector MakeBiomCommand::getMetaData(vector& lookup) if (m->control_pressed) { return metadata; } //if there is a bin label use it otherwise make one - string binLabel = binTag; - string sbinNumber = otuLabels[i]; - if (sbinNumber.length() < snumBins.length()) { - int diff = snumBins.length() - sbinNumber.length(); - for (int h = 0; h < diff; h++) { binLabel += "0"; } - } - binLabel += sbinNumber; - - labelTaxMap[binLabel] = taxs[i]; + if (m->isContainingOnlyDigits(otuLabels[i])) { + string binLabel = binTag; + string sbinNumber = otuLabels[i]; + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { binLabel += "0"; } + } + binLabel += sbinNumber; + labelTaxMap[binLabel] = taxs[i]; + }else { labelTaxMap[otuLabels[i]] = taxs[i]; } } diff --git a/mothurout.cpp b/mothurout.cpp index d9df5a0..c5eb0dc 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -2890,6 +2890,28 @@ int MothurOut::removeConfidences(string& tax) { } } /**************************************************************************************************/ +string MothurOut::removeQuotes(string tax) { + try { + + string taxon; + string newTax = ""; + + for (int i = 0; i < tax.length(); i++) { + + if (control_pressed) { return newTax; } + + if ((tax[i] != '\'') && (tax[i] != '\"')) { newTax += tax[i]; } + + } + + return newTax; + } + catch(exception& e) { + errorOut(e, "MothurOut", "removeQuotes"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/mothurout.h b/mothurout.h index 3338403..0c2e448 100644 --- a/mothurout.h +++ b/mothurout.h @@ -140,6 +140,7 @@ class MothurOut { void splitAtChar(string&, vector&, char); void splitAtChar(string&, string&, char); int removeConfidences(string&); + string removeQuotes(string); string makeList(vector&); bool isSubset(vector, vector); //bigSet, subset diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 951b200..dadc918 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -14,8 +14,9 @@ vector PreClusterCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -36,9 +37,10 @@ string PreClusterCommand::getHelpString(){ string helpString = ""; helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n"; helpString += "The pre.cluster command outputs a new fasta and name file.\n"; - helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n"; - helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; + helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n"; + helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"; helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n"; + helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n"; helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n"; helpString += "The pre.cluster command should be in the following format: \n"; helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n"; @@ -63,6 +65,7 @@ string PreClusterCommand::getOutputFileNameTag(string type, string inputName="") else { if (type == "fasta") { outputFileName = "precluster" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "precluster.names"; } + else if (type == "count") { outputFileName = "precluster.count_table"; } else if (type == "map") { outputFileName = "precluster.map"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -81,6 +84,7 @@ PreClusterCommand::PreClusterCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["map"] = tempOutNames; } catch(exception& e) { @@ -117,6 +121,7 @@ PreClusterCommand::PreClusterCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["map"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -146,6 +151,14 @@ PreClusterCommand::PreClusterCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -175,6 +188,25 @@ PreClusterCommand::PreClusterCommand(string option) { if (groupfile == "not found") { groupfile = ""; bygroup = false; } else if (groupfile == "not open") { abort = true; groupfile = ""; } else { m->setGroupFile(groupfile); bygroup = true; } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not found") { countfile = ""; } + else if (countfile == "not open") { abort = true; countfile = ""; } + else { + m->setCountTableFile(countfile); + ct.readTable(countfile); + if (ct.hasGroupInfo()) { bygroup = true; } + else { bygroup = false; } + } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } m->mothurConvert(temp, diffs); @@ -183,10 +215,12 @@ PreClusterCommand::PreClusterCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -207,10 +241,11 @@ int PreClusterCommand::execute(){ string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile)); string newFastaFile = fileroot + getOutputFileNameTag("fasta", fastafile); string newNamesFile = fileroot + getOutputFileNameTag("name"); + string newCountFile = fileroot + getOutputFileNameTag("count"); string newMapFile = fileroot + getOutputFileNameTag("map"); //add group name if by group outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile); - outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); - + if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); } + else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); } if (bygroup) { //clear out old files @@ -219,39 +254,45 @@ int PreClusterCommand::execute(){ newMapFile = fileroot + "precluster."; //parse fasta and name file by group - SequenceParser* parser; - if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); } - else { parser = new SequenceParser(groupfile, fastafile); } - - vector groups = parser->getNamesOfGroups(); - - if(processors == 1) { driverGroups(parser, newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); } - else { createProcessesGroups(parser, newFastaFile, newNamesFile, newMapFile, groups); } - - delete parser; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - //run unique.seqs for deconvolute results - string inputString = "fasta=" + newFastaFile; - if (namefile != "") { inputString += ", name=" + newNamesFile; } - m->mothurOutEndLine(); - m->mothurOut("/******************************************/"); m->mothurOutEndLine(); - m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); - m->mothurCalling = true; + vector groups; + if (countfile != "") { + cparser = new SequenceCountParser(countfile, fastafile); + groups = cparser->getNamesOfGroups(); + }else { + if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); } + else { parser = new SequenceParser(groupfile, fastafile); } + groups = parser->getNamesOfGroups(); + } - Command* uniqueCommand = new DeconvoluteCommand(inputString); - uniqueCommand->execute(); - - map > filenames = uniqueCommand->getOutputFiles(); - - delete uniqueCommand; - m->mothurCalling = false; - m->mothurOut("/******************************************/"); m->mothurOutEndLine(); - - m->renameFile(filenames["fasta"][0], newFastaFile); - m->renameFile(filenames["name"][0], newNamesFile); - + if(processors == 1) { driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); } + else { createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups); } + + if (countfile != "") { + mergeGroupCounts(newCountFile, newNamesFile, newFastaFile); + delete cparser; + }else { + delete parser; + //run unique.seqs for deconvolute results + string inputString = "fasta=" + newFastaFile; + if (namefile != "") { inputString += ", name=" + newNamesFile; } + m->mothurOutEndLine(); + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + m->mothurCalling = true; + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + m->mothurCalling = false; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + m->renameFile(filenames["fasta"][0], newFastaFile); + m->renameFile(filenames["name"][0], newNamesFile); + } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); }else { @@ -272,8 +313,9 @@ int PreClusterCommand::execute(){ m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); - printData(newFastaFile, newNamesFile); - + if (countfile != "") { newNamesFile = newCountFile; } + printData(newFastaFile, newNamesFile, ""); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); } @@ -295,6 +337,11 @@ int PreClusterCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } return 0; @@ -305,7 +352,7 @@ int PreClusterCommand::execute(){ } } /**************************************************************************************************/ -int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newFName, string newNName, string newMFile, vector groups) { +int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector groups) { try { vector processIDS; @@ -336,7 +383,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF process++; }else if (pid == 0){ outputNames.clear(); - num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups); + num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups); string tempFile = toString(getpid()) + ".outputNames.temp"; ofstream outTemp; @@ -355,7 +402,7 @@ int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newF } //do my part - num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups); + num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups); //force parent to wait until all the processes are done for (int i=0;i groups){ +int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector groups){ try { int numSeqs = 0; @@ -458,24 +505,29 @@ int PreClusterCommand::driverGroups(SequenceParser* parser, string newFFile, str m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine(); map thisNameMap; - if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); } - vector thisSeqs = parser->getSeqs(groups[i]); - + vector thisSeqs; + if (groupfile != "") { + thisSeqs = parser->getSeqs(groups[i]); + }else if (countfile != "") { + thisSeqs = cparser->getSeqs(groups[i]); + } + if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); } + //fill alignSeqs with this groups info. - numSeqs = loadSeqs(thisNameMap, thisSeqs); + numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]); if (m->control_pressed) { return 0; } if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } - int count = process(newMFile+groups[i]+".map"); + int count= process(newMFile+groups[i]+".map"); outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map"); if (m->control_pressed) { return 0; } m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine(); m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); - printData(newFFile, newNFile); + printData(newFFile, newNFile, groups[i]); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); @@ -559,26 +611,13 @@ int PreClusterCommand::readFASTA(){ //ifstream inNames; ifstream inFasta; - //m->openInputFile(namefile, inNames); m->openInputFile(fastafile, inFasta); - - //string firstCol, secondCol, nameString; set lengths; while (!inFasta.eof()) { if (m->control_pressed) { inFasta.close(); return 0; } - - //inNames >> firstCol >> secondCol; - //nameString = secondCol; - - //m->gobble(inNames); - //int size = 1; - //while (secondCol.find_first_of(',') != -1) { - // size++; - // secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); - //} - + Sequence seq(inFasta); m->gobble(inFasta); if (seq.getName() != "") { //can get "" if commented line is at end of fasta file @@ -592,14 +631,15 @@ int PreClusterCommand::readFASTA(){ lengths.insert(seq.getAligned().length()); } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, seq, seq.getName()); + int numRep = 1; + if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); } + seqPNode tempNode(numRep, seq, seq.getName()); alignSeqs.push_back(tempNode); lengths.insert(seq.getAligned().length()); } } } inFasta.close(); - //inNames.close(); if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } else if (lengths.size() == 1) { length = *(lengths.begin()); } @@ -613,13 +653,15 @@ int PreClusterCommand::readFASTA(){ } } /**************************************************************************************************/ -int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs){ +int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs, string group){ try { set lengths; alignSeqs.clear(); map::iterator it; bool error = false; - + map thisCount; + if (countfile != "") { thisCount = cparser->getCountTable(group); } + for (int i = 0; i < thisSeqs.size(); i++) { if (m->control_pressed) { return 0; } @@ -641,12 +683,20 @@ int PreClusterCommand::loadSeqs(map& thisName, vector& lengths.insert(thisSeqs[i].getAligned().length()); } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); + int numRep = 1; + if (countfile != "") { + map::iterator it2 = thisCount.find(thisSeqs[i].getName()); + + //should never be true since parser checks for this + if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; } + else { numRep = it2->second; } + } + seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName()); alignSeqs.push_back(tempNode); lengths.insert(thisSeqs[i].getAligned().length()); } } - + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } else if (lengths.size() == 1) { length = *(lengths.begin()); } @@ -683,10 +733,84 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ exit(1); } } +/**************************************************************************************************/ + +int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){ + try { + ifstream inNames; + m->openInputFile(newname, inNames); + + string group, first, second; + set uniqueNames; + while (!inNames.eof()) { + if (m->control_pressed) { break; } + inNames >> group; m->gobble(inNames); + inNames >> first; m->gobble(inNames); + inNames >> second; m->gobble(inNames); + + vector names; + m->splitAtComma(second, names); + + uniqueNames.insert(first); + + int total = ct.getGroupCount(first, group); + for (int i = 1; i < names.size(); i++) { + total += ct.getGroupCount(names[i], group); + ct.setAbund(names[i], group, 0); + } + ct.setAbund(first, group, total); + } + inNames.close(); + + vector namesOfSeqs = ct.getNamesOfSeqs(); + for (int i = 0; i < namesOfSeqs.size(); i++) { + if (ct.getNumSeqs(namesOfSeqs[i]) == 0) { + ct.remove(namesOfSeqs[i]); + } + } + + ct.printTable(newcount); + m->mothurRemove(newname); + + if (bygroup) { //if by group, must remove the duplicate seqs that are named the same + ifstream in; + m->openInputFile(newfasta, in); + + ofstream out; + m->openOutputFile(newfasta+"temp", out); + + int count = 0; + set already; + while(!in.eof()) { + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + count++; + if (already.count(seq.getName()) == 0) { + seq.printSequence(out); + already.insert(seq.getName()); + } + } + } + in.close(); + out.close(); + m->mothurRemove(newfasta); + m->renameFile(newfasta+"temp", newfasta); + } + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PreClusterCommand", "mergeGroupCounts"); + exit(1); + } +} /**************************************************************************************************/ -void PreClusterCommand::printData(string newfasta, string newname){ +void PreClusterCommand::printData(string newfasta, string newname, string group){ try { ofstream outFasta; ofstream outNames; @@ -699,10 +823,14 @@ void PreClusterCommand::printData(string newfasta, string newname){ m->openOutputFile(newname, outNames); } + if ((countfile != "") && (group == "")) { outNames << "Representative_Sequence\ttotal\n"; } for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (countfile != "") { + if (group != "") { outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } + else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl; } + }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } } } diff --git a/preclustercommand.h b/preclustercommand.h index 084bdc6..9844b8f 100644 --- a/preclustercommand.h +++ b/preclustercommand.h @@ -15,6 +15,7 @@ #include "command.hpp" #include "sequence.hpp" #include "sequenceparser.h" +#include "sequencecountparser.h" /************************************************************/ struct seqPNode { @@ -28,7 +29,13 @@ struct seqPNode { ~seqPNode() {} }; /************************************************************/ -inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); } +inline bool comparePriority(seqPNode first, seqPNode second) { + if (first.numIdentical > second.numIdentical) { return true; } + else if (first.numIdentical == second.numIdentical) { + if (first.seq.getName() > second.seq.getName()) { return true; } + } + return false; +} //************************************************************/ class PreClusterCommand : public Command { @@ -58,9 +65,13 @@ private: linePair(int i, int j) : start(i), end(j) {} }; + SequenceParser* parser; + SequenceCountParser* cparser; + CountTable ct; + int diffs, length, processors; bool abort, bygroup; - string fastafile, namefile, outputDir, groupfile; + string fastafile, namefile, outputDir, groupfile, countfile; vector alignSeqs; //maps the number of identical seqs to a sequence map names; //represents the names file first column maps to second column map sizes; //this map a seq name to the number of identical seqs in the names file @@ -73,11 +84,12 @@ private: void readNameFile(); //int readNamesFASTA(); int calcMisMatches(string, string); - void printData(string, string); //fasta filename, names file name + void printData(string, string, string); //fasta filename, names file name int process(string); - int loadSeqs(map&, vector&); - int driverGroups(SequenceParser*, string, string, string, int, int, vector groups); - int createProcessesGroups(SequenceParser*, string, string, string, vector); + int loadSeqs(map&, vector&, string); + int driverGroups(string, string, string, int, int, vector groups); + int createProcessesGroups(string, string, string, vector); + int mergeGroupCounts(string, string, string); }; /**************************************************************************************************/ @@ -87,7 +99,7 @@ private: struct preClusterData { string fastafile; string namefile; - string groupfile; + string groupfile, countfile; string newFName, newNName, newMName; MothurOut* m; int start; @@ -97,7 +109,7 @@ struct preClusterData { vector mapFileNames; preClusterData(){} - preClusterData(string f, string n, string g, string nff, string nnf, string nmf, vector gr, MothurOut* mout, int st, int en, int d, int tid) { + preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector gr, MothurOut* mout, int st, int en, int d, int tid) { fastafile = f; namefile = n; groupfile = g; @@ -110,6 +122,7 @@ struct preClusterData { diffs = d; threadID = tid; groups = gr; + countfile = c; } }; @@ -124,10 +137,15 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ //parse fasta and name file by group SequenceParser* parser; - if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } - else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } - - int numSeqs = 0; + SequenceCountParser* cparser; + if (pDataArray->countfile != "") { + cparser = new SequenceCountParser(pDataArray->countfile, pDataArray->fastafile); + }else { + if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); } + else { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile); } + } + + int numSeqs = 0; vector alignSeqs; //clear out old files ofstream outF; pDataArray->m->openOutputFile(pDataArray->newFName, outF); outF.close(); @@ -143,8 +161,13 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine(); map thisNameMap; - if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); } - vector thisSeqs = parser->getSeqs(pDataArray->groups[k]); + vector thisSeqs; + if (pDataArray->groupfile != "") { + thisSeqs = parser->getSeqs(pDataArray->groups[k]); + }else if (pDataArray->countfile != "") { + thisSeqs = cparser->getSeqs(pDataArray->groups[k]); + } + if (pDataArray->namefile != "") { thisNameMap = parser->getNameMap(pDataArray->groups[k]); } //fill alignSeqs with this groups info. //////////////////////////////////////////////////// @@ -154,6 +177,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ alignSeqs.clear(); map::iterator it; bool error = false; + map thisCount; + if (pDataArray->countfile != "") { thisCount = cparser->getCountTable(pDataArray->groups[k]); } + for (int i = 0; i < thisSeqs.size(); i++) { @@ -176,8 +202,16 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } } }else { //no names file, you are identical to yourself - seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); - alignSeqs.push_back(tempNode); + int numRep = 1; + if (pDataArray->countfile != "") { + map::iterator it2 = thisCount.find(thisSeqs[i].getName()); + + //should never be true since parser checks for this + if (it2 == thisCount.end()) { pDataArray->m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); error = true; } + else { numRep = it2->second; } + } + seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName()); + alignSeqs.push_back(tempNode); if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } } } @@ -274,7 +308,9 @@ static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){ for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + if (pDataArray->countfile != "") { outNames << pDataArray->groups[k] << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; + }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } + } } diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp index 80aeb3f..a29906c 100644 --- a/removegroupscommand.cpp +++ b/removegroupscommand.cpp @@ -71,7 +71,7 @@ string RemoveGroupsCommand::getOutputFileNameTag(string type, string inputName=" else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index a3fd1b8..2b930b5 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -10,6 +10,7 @@ #include "removelineagecommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector RemoveLineageCommand::setParameters(){ @@ -73,7 +74,7 @@ string RemoveLineageCommand::getOutputFileNameTag(string type, string inputName= else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else if (type == "alignreport") { outputFileName = "pick.align.report"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -585,6 +586,13 @@ int RemoveLineageCommand::readCount(){ } in.close(); out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } if (wroteSomething == false) { m->mothurOut("Your group file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); @@ -678,15 +686,17 @@ int RemoveLineageCommand::readTax(){ bool remove = false; + string noQuotesTax = m->removeQuotes(tax); + for (int j = 0; j < listOfTaxons.size(); j++) { - string newtax = tax; + string newtax = noQuotesTax; //if the users file contains confidence scores we want to ignore them when searching for the taxons, unless the taxon has them if (!taxonsHasConfidence[j]) { - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - newtax = tax; + newtax = noQuotesTax; m->removeConfidences(newtax); } @@ -701,7 +711,7 @@ int RemoveLineageCommand::readTax(){ } }else{//if taxons has them and you don't them remove taxons - int hasConfidences = tax.find_first_of('('); + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences == string::npos) { int pos = newtax.find(noConfidenceTaxons[j]); @@ -716,10 +726,10 @@ int RemoveLineageCommand::readTax(){ }else { //both have confidences so we want to make sure the users confidences are greater then or equal to the taxons //first remove confidences from both and see if the taxonomy exists - string noNewTax = tax; - int hasConfidences = tax.find_first_of('('); + string noNewTax = noQuotesTax; + int hasConfidences = noQuotesTax.find_first_of('('); if (hasConfidences != string::npos) { - noNewTax = tax; + noNewTax = noQuotesTax; m->removeConfidences(noNewTax); } diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 1fb9446..73873f3 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -10,6 +10,7 @@ #include "removeseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** vector RemoveSeqsCommand::setParameters(){ @@ -71,7 +72,7 @@ string RemoveSeqsCommand::getOutputFileNameTag(string type, string inputName="") else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } - else if (type == "count") { outputFileName = "pick.count.table"; } + else if (type == "count") { outputFileName = "pick.count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -538,6 +539,14 @@ int RemoveSeqsCommand::readCount(){ } in.close(); out.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(outputFileName)) { + ct.readTable(outputFileName); + ct.printTable(outputFileName); + } + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 6a9a613..312f475 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -8,14 +8,15 @@ */ #include "screenseqscommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector ScreenSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(palignreport); CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); @@ -44,8 +45,8 @@ vector ScreenSeqsCommand::setParameters(){ string ScreenSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The screen.seqs command reads a fastafile and creates .....\n"; - helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; + helpString += "The screen.seqs command reads a fastafile and screens sequences.\n"; + helpString += "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, group, count, qfile, alignreport, taxonomy, optimize, criteria and processors.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The alignreport and taxonomy parameters allow you to remove bad seqs from taxonomy and alignreport files.\n"; helpString += "The start parameter is used to set a position the \"good\" sequences must start by. The default is -1.\n"; @@ -83,6 +84,7 @@ string ScreenSeqsCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "good" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "good" + m->getExtension(inputName); } else if (type == "accnos") { outputFileName = "bad.accnos"; } else if (type == "qfile") { outputFileName = "good" + m->getExtension(inputName); } @@ -110,6 +112,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand"); @@ -149,6 +152,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { outputTypes["accnos"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -202,6 +206,14 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -229,6 +241,19 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + alignreport = validParameter.validFile(parameters, "alignreport", true); if (alignreport == "not open") { abort = true; } else if (alignreport == "not found") { alignreport = ""; } @@ -288,10 +313,12 @@ ScreenSeqsCommand::ScreenSeqsCommand(string option) { temp = validParameter.validFile(parameters, "criteria", false); if (temp == "not found"){ temp = "90"; } m->mothurConvert(temp, criteria); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -312,6 +339,11 @@ int ScreenSeqsCommand::execute(){ if (optimize.size() != 0) { //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here //use the namefile to optimize correctly if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } getSummary(positions); } else { @@ -472,7 +504,9 @@ int ScreenSeqsCommand::execute(){ screenNameGroupFile(badSeqNames); if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } }else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the group - + else if (countfile != "") { screenCountFile(badSeqNames); } + + if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; } if(alignreport != "") { screenAlignReport(badSeqNames); } @@ -519,6 +553,11 @@ int ScreenSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -962,7 +1001,69 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ exit(1); } } +//*************************************************************************************************************** +int ScreenSeqsCommand::screenCountFile(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + + string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + it = badSeqNames.find(name); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + } + else{ + goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl; + } + } + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + //we were unable to remove some of the bad sequences + if (badSeqNames.size() != 0) { + for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) { + m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct."); + m->mothurOutEndLine(); + } + } + + in.close(); + goodCountOut.close(); + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ScreenSeqsCommand", "screenCountFile"); + exit(1); + } +} //*************************************************************************************************************** int ScreenSeqsCommand::screenAlignReport(set badSeqNames){ diff --git a/screenseqscommand.h b/screenseqscommand.h index 771113d..b0d7c7c 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -44,6 +44,7 @@ private: int screenNameGroupFile(set); int screenGroupFile(set); + int screenCountFile(set); int screenAlignReport(set); int screenQual(set); int screenTaxonomy(set); @@ -56,7 +57,7 @@ private: #endif bool abort; - string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy; + string fastafile, namefile, groupfile, alignreport, outputDir, qualfile, taxonomy, countfile; int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, processors, criteria; vector outputNames; vector optimize; diff --git a/splitgroupscommand.cpp b/splitgroupscommand.cpp index af3ca66..f3c6cd9 100644 --- a/splitgroupscommand.cpp +++ b/splitgroupscommand.cpp @@ -10,13 +10,15 @@ #include "splitgroupscommand.h" #include "sharedutilities.h" #include "sequenceparser.h" +#include "counttable.h" //********************************************************************************************************************** vector SplitGroupCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "CountGroup", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "CountGroup", "none",false,false); parameters.push_back(pgroup); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -34,9 +36,9 @@ vector SplitGroupCommand::setParameters(){ string SplitGroupCommand::getHelpString(){ try { string helpString = ""; - helpString += "The split.group command reads a group file, and parses your fasta and names files by groups. \n"; - helpString += "The split.group command parameters are fasta, name, group and groups.\n"; - helpString += "The fasta and group parameters are required.\n"; + helpString += "The split.group command reads a group or count file, and parses your fasta and names or count files by groups. \n"; + helpString += "The split.group command parameters are fasta, name, group, count and groups.\n"; + helpString += "The fasta and group or count parameters are required.\n"; helpString += "The groups parameter allows you to select groups to create files for. \n"; helpString += "For example if you set groups=A-B-C, you will get a .A.fasta, .A.names, .B.fasta, .B.names, .C.fasta, .C.names files. \n"; helpString += "If you want .fasta and .names files for all groups, set groups=all. \n"; @@ -62,6 +64,7 @@ string SplitGroupCommand::getOutputFileNameTag(string type, string inputName="") else { if (type == "fasta") { outputFileName = "fasta"; } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -79,6 +82,7 @@ SplitGroupCommand::SplitGroupCommand(){ vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "SplitGroupCommand", "SplitGroupCommand"); @@ -112,6 +116,7 @@ SplitGroupCommand::SplitGroupCommand(string option) { vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -141,7 +146,14 @@ SplitGroupCommand::SplitGroupCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } - + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -160,23 +172,56 @@ SplitGroupCommand::SplitGroupCommand(string option) { groupfile = validParameter.validFile(parameters, "group", true); if (groupfile == "not open") { groupfile = ""; abort = true; } - else if (groupfile == "not found") { - groupfile = m->getGroupFile(); - if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current groupfile and the group parameter is required."); m->mothurOutEndLine(); abort = true; } + else if (groupfile == "not found") { groupfile = ""; }else { m->setGroupFile(groupfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + if ((countfile != "") && (groupfile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } + + if ((countfile == "") && (groupfile == "")) { + if (namefile == "") { //check for count then group + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); + abort = true; + } + } + }else { //check for group + groupfile = m->getGroupFile(); + if (groupfile != "") { m->mothurOut("Using " + groupfile + " as input file for the group parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a count or group file."); m->mothurOutEndLine(); + abort = true; + } + } + } groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = ""; } else { m->splitAtDash(groups, Groups); } //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(groupfile); } + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + if (groupfile != "") { outputDir = m->hasPath(groupfile); } + else { outputDir = m->hasPath(countfile); } + } - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -191,13 +236,48 @@ int SplitGroupCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - SequenceParser* parser; + if (countfile == "" ) { runNameGroup(); } + else { runCount(); } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SplitGroupCommand::runNameGroup(){ + try { + SequenceParser* parser; if (namefile == "") { parser = new SequenceParser(groupfile, fastafile); } else { parser = new SequenceParser(groupfile, fastafile, namefile); } if (m->control_pressed) { delete parser; return 0; } - + vector namesGroups = parser->getNamesOfGroups(); SharedUtil util; util.setGroups(Groups, namesGroups); @@ -215,7 +295,7 @@ int SplitGroupCommand::execute(){ parser->getSeqs(Groups[i], newFasta, false); outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta); if (m->control_pressed) { delete parser; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - + if (namefile != "") { parser->getNameMap(Groups[i], newName); outputNames.push_back(newName); outputTypes["name"].push_back(newName); @@ -225,29 +305,77 @@ int SplitGroupCommand::execute(){ } delete parser; - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - string current = ""; - itTypes = outputTypes.find("fasta"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } - } - - itTypes = outputTypes.find("name"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } - } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - return 0; + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SplitGroupCommand", "runNameGroup"); + exit(1); } +} +//********************************************************************************************************************** +int SplitGroupCommand::runCount(){ + try { + + CountTable ct; + ct.readTable(countfile); + if (!ct.hasGroupInfo()) { m->mothurOut("[ERROR]: your count file does not contain group info, cannot split by group.\n"); m->control_pressed = true; } + + if (m->control_pressed) { return 0; } + + vector namesGroups = ct.getNamesOfGroups(); + SharedUtil util; util.setGroups(Groups, namesGroups); + + //fill filehandles with neccessary ofstreams + map ffiles; + map cfiles; + ofstream* temp; + for (int i=0; igetRootName(m->getSimpleName(fastafile)) + Groups[i] + "." + getOutputFileNameTag("fasta"); + outputNames.push_back(newFasta); outputTypes["fasta"].push_back(newFasta); + m->openOutputFile(newFasta, (*temp)); + temp = new ofstream; + cfiles[Groups[i]] = temp; + string newCount = outputDir + m->getRootName(m->getSimpleName(countfile)) + Groups[i] + "." + getOutputFileNameTag("count"); + m->openOutputFile(newCount, (*temp)); + outputNames.push_back(newCount); outputTypes["count"].push_back(newCount); + (*temp) << "Representative_Sequence\ttotal\t" << Groups[i] << endl; + } + + ifstream in; + m->openInputFile(fastafile, in); + + while (!in.eof()) { + Sequence seq(in); m->gobble(in); + + if (m->control_pressed) { break; } + if (seq.getName() != "") { + vector thisSeqsGroups = ct.getGroups(seq.getName()); + for (int i = 0; i < thisSeqsGroups.size(); i++) { + if (m->inUsersGroups(thisSeqsGroups[i], Groups)) { //if this sequence belongs to a group we want them print + seq.printSequence(*(ffiles[thisSeqsGroups[i]])); + int numSeqs = ct.getGroupCount(seq.getName(), Groups[i]); + (*(cfiles[thisSeqsGroups[i]])) << seq.getName() << '\t' << numSeqs << '\t' << numSeqs << endl; + } + } + } + } + in.close(); + + //close and delete ofstreams + for (int i=0; ierrorOut(e, "SplitGroupCommand", "execute"); + m->errorOut(e, "SplitGroupCommand", "runCount"); exit(1); } } diff --git a/splitgroupscommand.h b/splitgroupscommand.h index a8dc9a1..62e063d 100644 --- a/splitgroupscommand.h +++ b/splitgroupscommand.h @@ -42,9 +42,12 @@ public: private: vector outputNames; - string outputDir, namefile, groupfile, groups, fastafile; + string outputDir, namefile, groupfile, countfile, groups, fastafile; vector Groups; bool abort; + + int runNameGroup(); + int runCount(); }; /***************************************************************************************/ diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp index fb4887c..bba6289 100644 --- a/treegroupscommand.cpp +++ b/treegroupscommand.cpp @@ -16,8 +16,9 @@ vector TreeGroupCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); - CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); parameters.push_back(pcolumn); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "countcolumn",false,false); parameters.push_back(pcount); + CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName-countcolumn",false,false); parameters.push_back(pcolumn); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); @@ -160,6 +161,14 @@ TreeGroupCommand::TreeGroupCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -182,6 +191,11 @@ TreeGroupCommand::TreeGroupCommand(string option) { if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } if ((phylipfile == "") && (columnfile == "") && (sharedfile == "")) { //is there are current file available for either of these? @@ -204,15 +218,20 @@ TreeGroupCommand::TreeGroupCommand(string option) { else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When running the tree.shared command with a distance file you may not use both the column and the phylip parameters."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { - if (namefile == "") { + if ((namefile == "") && (countfile == "")){ namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -433,17 +452,22 @@ int TreeGroupCommand::execute(){ readMatrix->setCutoff(cutoff); - if(namefile != ""){ - nameMap = new NameAssignment(namefile); - nameMap->readMap(); - } - else{ nameMap = NULL; } - - readMatrix->read(nameMap); + ct = NULL; + if(namefile != ""){ + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + readMatrix->read(nameMap); + }else if (countfile != "") { + ct = new CountTable(); + ct->readTable(countfile); + readMatrix->read(ct); + } + list = readMatrix->getListVector(); SparseDistanceMatrix* dMatrix = readMatrix->getDMatrix(); //make treemap + if (ct != NULL) { delete ct; } ct = new CountTable(); set nameMap; map groupMap; @@ -461,7 +485,7 @@ int TreeGroupCommand::execute(){ //clear globaldatas old tree names if any m->Treenames.clear(); - + //fills globaldatas tree names m->Treenames = m->getGroups(); diff --git a/treegroupscommand.h b/treegroupscommand.h index 7342b36..b29670a 100644 --- a/treegroupscommand.h +++ b/treegroupscommand.h @@ -110,7 +110,7 @@ private: vector treeCalculators; vector lookup; string lastLabel; - string format, groupNames, filename, sharedfile, inputfile; + string format, groupNames, filename, sharedfile, countfile, inputfile; int numGroups, subsampleSize, iters, processors; ofstream out; float precision, cutoff; diff --git a/treemap.cpp b/treemap.cpp index 06d05d4..47b7cf3 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -246,26 +246,6 @@ string TreeMap::getGroup(string sequenceName) { return "not found"; } -} -/************************************************************ -void TreeMap::setIndex(string seq, int index) { - it = treemap.find(seq); - if (it != treemap.end()) { //sequence name was in group file - treemap[seq].vectorIndex = index; - }else { - treemap[seq].vectorIndex = index; - treemap[seq].groupname = "not found"; - } -} -/************************************************************ -int TreeMap::getIndex(string seq) { - - it = treemap.find(seq); - // if it is a valid sequence name then return index - if (it != treemap.end()) { return treemap[seq].vectorIndex; } - // if not return error code - else { return -1; } - } /************************************************************/ diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 29b8733..bbb0b36 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -114,7 +114,7 @@ string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "fasta") { outputFileName = "fasta"; } else if (type == "group") { outputFileName = "groups"; } else if (type == "name") { outputFileName = "names"; } - else if (type == "count") { outputFileName = "count.table"; } + else if (type == "count") { outputFileName = "count_table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName;