From ea4f373c28543cd1002b0dd7dc6e55c526647d59 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Fri, 14 Sep 2012 15:17:37 -0400 Subject: [PATCH] added count file to trim.seqs, get.groups, get.lineage, get.seqs, heatmap.sim, list.seqs, remove.groups, remove.lineage, remove.seqs, align.check, summary.seqs, summary.qual. --- classify.cpp | 6 +- counttable.cpp | 16 +++ counttable.h | 1 + getgroupscommand.cpp | 2 +- getlineagecommand.cpp | 115 +++++++++++++--- getlineagecommand.h | 3 +- getseqscommand.cpp | 107 +++++++++++++-- getseqscommand.h | 3 +- heatmapsimcommand.cpp | 93 ++++++++----- heatmapsimcommand.h | 2 +- listseqscommand.cpp | 47 +++++-- listseqscommand.h | 3 +- makefile | 2 +- removegroupscommand.cpp | 2 +- removelineagecommand.cpp | 104 ++++++++++++-- removelineagecommand.h | 3 +- removeseqscommand.cpp | 113 ++++++++++++++-- removeseqscommand.h | 3 +- secondarystructurecommand.cpp | 40 ++++-- secondarystructurecommand.h | 2 +- seqsummarycommand.cpp | 59 +++++--- seqsummarycommand.h | 12 +- summaryqualcommand.cpp | 52 +++++-- summaryqualcommand.h | 11 +- trimseqscommand.cpp | 247 +++++++++++++++++++++++++--------- trimseqscommand.h | 62 ++++++--- 26 files changed, 890 insertions(+), 220 deletions(-) diff --git a/classify.cpp b/classify.cpp index ace89b9..459c90f 100644 --- a/classify.cpp +++ b/classify.cpp @@ -61,7 +61,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me names.push_back(temp.getName()); database->addSequence(temp); } - database->generateDB(); + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); database->readKmerDB(kmerFileTest); @@ -200,7 +201,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } fastaFile.close(); - database->generateDB(); + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); diff --git a/counttable.cpp b/counttable.cpp index bc9d4da..cd623ec 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -661,6 +661,22 @@ vector CountTable::getNamesOfSeqs() { } } /************************************************************/ +//returns the names of all unique sequences in file mapped to their seqCounts +map CountTable::getNameMap() { + try { + map names; + for (map::iterator it = indexNameMap.begin(); it != indexNameMap.end(); it++) { + names[it->first] = totals[it->second]; + } + + return names; + } + catch(exception& e) { + m->errorOut(e, "CountTable", "getNameMap"); + exit(1); + } +} +/************************************************************/ //returns the names of all unique sequences in file vector CountTable::getNamesOfSeqs(string group) { try { diff --git a/counttable.h b/counttable.h index 68ba8d2..b66c4c6 100644 --- a/counttable.h +++ b/counttable.h @@ -83,6 +83,7 @@ class CountTable { vector getNamesOfSeqs(string); int mergeCounts(string, string); //combines counts for 2 seqs, saving under the first name passed in. ListVector getListVector(); + map getNameMap(); private: string filename; diff --git a/getgroupscommand.cpp b/getgroupscommand.cpp index 7585c12..910a872 100644 --- a/getgroupscommand.cpp +++ b/getgroupscommand.cpp @@ -72,7 +72,7 @@ string GetGroupsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count.table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } diff --git a/getlineagecommand.cpp b/getlineagecommand.cpp index 1aba0fe..1cd139b 100644 --- a/getlineagecommand.cpp +++ b/getlineagecommand.cpp @@ -15,8 +15,9 @@ vector GetLineageCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -38,9 +39,9 @@ vector GetLineageCommand::setParameters(){ string GetLineageCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"; + helpString += "The get.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n"; helpString += "It outputs a file containing only the sequences from the taxonomy file that are from the taxon requested.\n"; - helpString += "The get.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; + helpString += "The get.lineage command parameters are taxon, fasta, name, group, count, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"; helpString += "The taxon parameter allows you to select the taxons you would like to get and is required.\n"; helpString += "You may enter your taxons with confidence scores, doing so will get only those sequences that belong to the taxonomy and whose cofidence scores is above the scores you give.\n"; @@ -70,6 +71,7 @@ string GetLineageCommand::getOutputFileNameTag(string type, string inputName="") if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count.table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } @@ -94,6 +96,7 @@ GetLineageCommand::GetLineageCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetLineageCommand", "GetLineageCommand"); @@ -131,6 +134,7 @@ GetLineageCommand::GetLineageCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -187,6 +191,14 @@ GetLineageCommand::GetLineageCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -230,6 +242,19 @@ GetLineageCommand::GetLineageCommand(string option) { else { temp = "false"; usedDups = ""; } } dups = m->isTrue(temp); + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } taxons = validParameter.validFile(parameters, "taxon", false); if (taxons == "not found") { taxons = ""; m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine(); abort = true; } @@ -240,12 +265,14 @@ GetLineageCommand::GetLineageCommand(string option) { } m->splitAtChar(taxons, listOfTaxons, '-'); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -262,11 +289,18 @@ int GetLineageCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (taxfile != "") { readTax(); } //fills the set of names to get if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } + if (countfile != "") { readCount(); } if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } @@ -305,7 +339,12 @@ int GetLineageCommand::execute(){ itTypes = outputTypes.find("taxonomy"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } - } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -353,7 +392,7 @@ int GetLineageCommand::readFasta(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName); return 0; @@ -365,6 +404,52 @@ int GetLineageCommand::readFasta(){ } } //********************************************************************************************************************** +int GetLineageCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) != 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + } + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetLineageCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int GetLineageCommand::readList(){ try { string thisOutputDir = outputDir; @@ -425,7 +510,7 @@ int GetLineageCommand::readList(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName); return 0; @@ -510,7 +595,7 @@ int GetLineageCommand::readName(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["name"].push_back(outputFileName); return 0; @@ -558,7 +643,7 @@ int GetLineageCommand::readGroup(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["group"].push_back(outputFileName); return 0; @@ -814,7 +899,7 @@ int GetLineageCommand::readAlign(){ in.close(); out.close(); - if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + if (wroteSomething == false) { m->mothurOut("Your file contains does not contain any sequences from " + taxons + "."); m->mothurOutEndLine(); } outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName); return 0; diff --git a/getlineagecommand.h b/getlineagecommand.h index 0ab042b..99bc0fa 100644 --- a/getlineagecommand.h +++ b/getlineagecommand.h @@ -36,11 +36,12 @@ class GetLineageCommand : public Command { private: set names; vector outputNames, listOfTaxons; - string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + string fastafile, namefile, groupfile, alignfile, countfile, listfile, taxfile, outputDir, taxons; bool abort, dups; int readFasta(); int readName(); + int readCount(); int readGroup(); int readAlign(); int readList(); diff --git a/getseqscommand.cpp b/getseqscommand.cpp index ccabafb..e0faef4 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -15,8 +15,9 @@ vector GetSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -40,7 +41,7 @@ vector GetSeqsCommand::setParameters(){ string GetSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"; + helpString += "The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n"; helpString += "It outputs a file containing only the sequences in the .accnos file.\n"; helpString += "The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos unless you have a valid current accnos file, and at least one of the other parameters.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"; @@ -68,6 +69,7 @@ GetSeqsCommand::GetSeqsCommand(){ outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["accnosreport"] = tempOutNames; } catch(exception& e) { @@ -88,6 +90,7 @@ string GetSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ if (type == "fasta") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count.table"; } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } @@ -135,6 +138,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["accnosreport"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -215,6 +219,14 @@ GetSeqsCommand::GetSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["qfile"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -270,17 +282,32 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (accnosfile2 == "not open") { abort = true; } else if (accnosfile2 == "not found") { accnosfile2 = ""; } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "true"; usedDups = ""; } dups = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } - - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } } @@ -300,11 +327,18 @@ int GetSeqsCommand::execute(){ names = m->readAccnos(accnosfile); if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (namefile != "") { readName(); } if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } + if (countfile != "") { readCount(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } @@ -354,6 +388,10 @@ int GetSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -493,6 +531,57 @@ int GetSeqsCommand::readQual(){ exit(1); } } +//********************************************************************************************************************** +int GetSeqsCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int selectedCount = 0; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) != 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + selectedCount+= thisTotal; + } + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file does not contain any sequence from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Selected " + toString(selectedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "GetSeqsCommand", "readCount"); + exit(1); + } +} + //********************************************************************************************************************** int GetSeqsCommand::readList(){ try { diff --git a/getseqscommand.h b/getseqscommand.h index c71b5f2..60e471e 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -35,7 +35,7 @@ class GetSeqsCommand : public Command { private: set names; vector outputNames; - string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; + string accnosfile, accnosfile2, fastafile, namefile, countfile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; //for debug @@ -44,6 +44,7 @@ class GetSeqsCommand : public Command { int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/heatmapsimcommand.cpp b/heatmapsimcommand.cpp index 3de10e6..8a4a12b 100644 --- a/heatmapsimcommand.cpp +++ b/heatmapsimcommand.cpp @@ -25,7 +25,8 @@ vector HeatMapSimCommand::setParameters(){ try { CommandParameter pshared("shared", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pshared); CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "none",false,false); parameters.push_back(pphylip); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnShared", "PhylipColumnShared", "ColumnName",false,false); parameters.push_back(pcolumn); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); @@ -48,9 +49,8 @@ string HeatMapSimCommand::getHelpString(){ try { string helpString = ""; ValidCalculators validCalculator; - helpString += "The heatmap.sim command parameters are shared, phylip, column, name, groups, calc, fontsize and label. shared or phylip or column and name are required unless valid current files exist.\n"; - helpString += "There are two ways to use the heatmap.sim command. The first is with the read.otu command. \n"; - helpString += "With the read.otu command you may use the groups, label and calc parameters. \n"; + helpString += "The heatmap.sim command parameters are shared, phylip, column, name, count, groups, calc, fontsize and label. shared or phylip or column and name are required unless valid current files exist.\n"; + helpString += "There are two ways to use the heatmap.sim command. The first is with a shared file, and you may use the groups, label and calc parameter. \n"; helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included in your heatmap.\n"; helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like a heatmap created for, and is also separated by dashes.\n"; helpString += "The fontsize parameter allows you to adjust the font size of the picture created, default=24.\n"; @@ -174,6 +174,14 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["shared"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //required parameters @@ -197,6 +205,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //error checking on files if ((sharedfile == "") && ((phylipfile == "") && (columnfile == ""))) { @@ -224,8 +238,12 @@ HeatMapSimCommand::HeatMapSimCommand(string option) { namefile = m->getNameFile(); if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } else { - m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); - abort = true; + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("You need to provide a name or count file if you are going to use the column format."); m->mothurOutEndLine(); + abort = true; + } } } } @@ -520,20 +538,28 @@ int HeatMapSimCommand::runCommandDist() { in.close(); }else { //read names file - NameAssignment* nameMap = new NameAssignment(namefile); - nameMap->readMap(); - - //put names in order in vector - for (int i = 0; i < nameMap->size(); i++) { - names.push_back(nameMap->get(i)); - } - - //resize matrix - matrix.resize(nameMap->size()); - for (int i = 0; i < nameMap->size(); i++) { - matrix[i].resize(nameMap->size(), 0.0); - } - + NameAssignment* nameMap; + CountTable ct; + if (namefile != "") { + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + + //put names in order in vector + for (int i = 0; i < nameMap->size(); i++) { + names.push_back(nameMap->get(i)); + } + }else if (countfile != "") { + nameMap = NULL; + ct.readTable(countfile); + names = ct.getNamesOfSeqs(); + } + + //resize matrix + matrix.resize(names.size()); + for (int i = 0; i < names.size(); i++) { + matrix[i].resize(names.size(), 0.0); + } + //read column file string first, second; double dist; @@ -544,19 +570,26 @@ int HeatMapSimCommand::runCommandDist() { if (m->control_pressed) { return 0; } - map::iterator itA = nameMap->find(first); - map::iterator itB = nameMap->find(second); - - if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1); } - if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1); } - - //save distance - matrix[itA->second][itB->second] = dist; - matrix[itB->second][itA->second] = dist; + if (namefile != "") { + map::iterator itA = nameMap->find(first); + map::iterator itB = nameMap->find(second); + + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + first + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + second + "' was not found in the names file, please correct\n"); exit(1); } + + //save distance + matrix[itA->second][itB->second] = dist; + matrix[itB->second][itA->second] = dist; + }else if (countfile != "") { + int itA = ct.get(first); + int itB = ct.get(second); + matrix[itA][itB] = dist; + matrix[itB][itA] = dist; + } } in.close(); - delete nameMap; + if (namefile != "") { delete nameMap; } } diff --git a/heatmapsimcommand.h b/heatmapsimcommand.h index 7b74880..2c3a470 100644 --- a/heatmapsimcommand.h +++ b/heatmapsimcommand.h @@ -43,7 +43,7 @@ private: OptionParser* parser; bool abort, allLines; set labels; //holds labels to be used - string format, groups, label, calc, sharedfile, phylipfile, columnfile, namefile, outputDir, inputfile; + string format, groups, label, calc, sharedfile, phylipfile, columnfile, countfile, namefile, outputDir, inputfile; vector Estimators, Groups, outputNames; int fontsize; diff --git a/listseqscommand.cpp b/listseqscommand.cpp index bfbb078..7c3f07f 100644 --- a/listseqscommand.cpp +++ b/listseqscommand.cpp @@ -10,6 +10,7 @@ #include "listseqscommand.h" #include "sequence.hpp" #include "listvector.hpp" +#include "counttable.h" //********************************************************************************************************************** @@ -17,6 +18,7 @@ vector ListSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pfasta); CommandParameter pname("name", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pcount); CommandParameter pgroup("group", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "FNGLT", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); @@ -37,8 +39,8 @@ vector ListSeqsCommand::setParameters(){ string ListSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The list.seqs command reads a fasta, name, group, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n"; - helpString += "The list.seqs command parameters are fasta, name, group, list, taxonomy and alignreport. You must provide one of these parameters.\n"; + helpString += "The list.seqs command reads a fasta, name, group, count, list, taxonomy or alignreport file and outputs a .accnos file containing sequence names.\n"; + helpString += "The list.seqs command parameters are fasta, name, group, count, list, taxonomy and alignreport. You must provide one of these parameters.\n"; helpString += "The list.seqs command should be in the following format: list.seqs(fasta=yourFasta).\n"; helpString += "Example list.seqs(fasta=amazon.fasta).\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; @@ -164,6 +166,14 @@ ListSeqsCommand::ListSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -195,8 +205,13 @@ ListSeqsCommand::ListSeqsCommand(string option) { if (taxfile == "not open") { abort = true; } else if (taxfile == "not found") { taxfile = ""; } else { m->setTaxonomyFile(taxfile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } - if ((fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == "")) { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; } + if ((countfile == "") && (fastafile == "") && (namefile == "") && (listfile == "") && (groupfile == "") && (alignfile == "") && (taxfile == "")) { m->mothurOut("You must provide a file."); m->mothurOutEndLine(); abort = true; } int okay = 1; if (outputDir != "") { okay++; } @@ -225,6 +240,7 @@ int ListSeqsCommand::execute(){ else if (alignfile != "") { inputFileName = alignfile; readAlign(); } else if (listfile != "") { inputFileName = listfile; readList(); } else if (taxfile != "") { inputFileName = taxfile; readTax(); } + else if (countfile != "") { inputFileName = countfile; readCount(); } if (m->control_pressed) { outputTypes.clear(); return 0; } @@ -293,12 +309,6 @@ int ListSeqsCommand::readFasta(){ Sequence currSeq(in); name = currSeq.getName(); - //if (lastName == "") { lastName = name; } - //if (name != lastName) { count = 1; } - // lastName = name; - - //Sequence newSeq(name+"_"+toString(count), currSeq.getAligned()); - //newSeq.printSequence(out); if (name != "") { names.push_back(name); } @@ -404,7 +414,24 @@ int ListSeqsCommand::readGroup(){ exit(1); } } - +//********************************************************************************************************************** +int ListSeqsCommand::readCount(){ + try { + CountTable ct; + ct.readTable(countfile); + + if (m->control_pressed) { return 0; } + + names = ct.getNamesOfSeqs(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ListSeqsCommand", "readCount"); + exit(1); + } +} //********************************************************************************************************************** //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name int ListSeqsCommand::readAlign(){ diff --git a/listseqscommand.h b/listseqscommand.h index 1a31a6d..8e4cce3 100644 --- a/listseqscommand.h +++ b/listseqscommand.h @@ -34,7 +34,7 @@ class ListSeqsCommand : public Command { private: vector names, outputNames; - string fastafile, namefile, groupfile, alignfile, inputFileName, outputDir, listfile, taxfile; + string fastafile, namefile, groupfile, countfile, alignfile, inputFileName, outputDir, listfile, taxfile; bool abort; int readFasta(); @@ -43,6 +43,7 @@ class ListSeqsCommand : public Command { int readAlign(); int readList(); int readTax(); + int readCount(); }; diff --git a/makefile b/makefile index 32ede6e..bc5a569 100644 --- a/makefile +++ b/makefile @@ -17,7 +17,7 @@ USECOMPRESSION ?= no MOTHUR_FILES="\"Enter_your_default_path_here\"" RELEASE_DATE = "\"7/9/2012\"" VERSION = "\"1.26.0\"" -FORTAN_COMPILER = gfortran +FORTAN_COMPILER = /usr/local/gfortran/bin/gfortran FORTRAN_FLAGS = # Optimize to level 3: diff --git a/removegroupscommand.cpp b/removegroupscommand.cpp index 86ddf94..80aeb3f 100644 --- a/removegroupscommand.cpp +++ b/removegroupscommand.cpp @@ -71,7 +71,7 @@ string RemoveGroupsCommand::getOutputFileNameTag(string type, string inputName=" else if (type == "taxonomy") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count.table"; } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "shared") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "design") { outputFileName = "pick" + m->getExtension(inputName); } diff --git a/removelineagecommand.cpp b/removelineagecommand.cpp index 4cec90f..a3fd1b8 100644 --- a/removelineagecommand.cpp +++ b/removelineagecommand.cpp @@ -15,8 +15,9 @@ vector RemoveLineageCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,true); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -38,9 +39,9 @@ vector RemoveLineageCommand::setParameters(){ string RemoveLineageCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n"; + helpString += "The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, count, list or alignreport file.\n"; helpString += "It outputs a file containing only the sequences from the taxonomy file that are not from the taxon you requested to be removed.\n"; - helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; + helpString += "The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, count, alignreport and dups. You must provide taxonomy unless you have a valid current taxonomy file.\n"; helpString += "The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"; helpString += "The taxon parameter allows you to select the taxons you would like to remove, and is required.\n"; helpString += "You may enter your taxons with confidence scores, doing so will remove only those sequences that belong to the taxonomy and whose cofidence scores fall below the scores you give.\n"; @@ -72,6 +73,7 @@ string RemoveLineageCommand::getOutputFileNameTag(string type, string inputName= else if (type == "name") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "group") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } + else if (type == "count") { outputFileName = "pick.count.table"; } else if (type == "alignreport") { outputFileName = "pick.align.report"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } @@ -94,6 +96,7 @@ RemoveLineageCommand::RemoveLineageCommand(){ outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand"); @@ -131,6 +134,7 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -187,6 +191,14 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -223,6 +235,19 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { else { m->mothurOut("You have no current taxonomy file and the taxonomy parameter is required."); m->mothurOutEndLine(); abort = true; } }else { m->setTaxonomyFile(taxfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + string usedDups = "true"; string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { @@ -240,14 +265,16 @@ RemoveLineageCommand::RemoveLineageCommand(string option) { } m->splitAtChar(taxons, listOfTaxons, '-'); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (countfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, count, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } - if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ - vector files; files.push_back(fastafile); files.push_back(taxfile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && ((fastafile != "") || (taxfile != ""))){ + vector files; files.push_back(fastafile); files.push_back(taxfile); + parser.getNameFile(files); + } + } } @@ -265,6 +292,12 @@ int RemoveLineageCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (taxfile != "") { readTax(); } //fills the set of names to remove @@ -273,6 +306,7 @@ int RemoveLineageCommand::execute(){ if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } + if (countfile != "") { readCount(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -309,6 +343,11 @@ int RemoveLineageCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -511,7 +550,52 @@ int RemoveLineageCommand::readName(){ exit(1); } } - +//********************************************************************************************************************** +int RemoveLineageCommand::readCount(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) == 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + } + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your group file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveLineageCommand", "readCount"); + exit(1); + } +} //********************************************************************************************************************** int RemoveLineageCommand::readGroup(){ try { diff --git a/removelineagecommand.h b/removelineagecommand.h index a5caec8..a756d24 100644 --- a/removelineagecommand.h +++ b/removelineagecommand.h @@ -34,12 +34,13 @@ class RemoveLineageCommand : public Command { private: set names; vector outputNames, listOfTaxons; - string fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir, taxons; + string fastafile, namefile, groupfile, alignfile, listfile, countfile, taxfile, outputDir, taxons; bool abort, dups; int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index 0d53c1a..1fb9446 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -15,8 +15,9 @@ vector RemoveSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "FNGLT", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "FNGLT", "none",false,false); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "FNGLT", "none",false,false); parameters.push_back(pgroup); CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist); CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy); CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport); @@ -39,9 +40,9 @@ vector RemoveSeqsCommand::setParameters(){ string RemoveSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n"; + helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, count, list, taxonomy, quality or alignreport file.\n"; helpString += "It outputs a file containing the sequences NOT in the .accnos file.\n"; - helpString += "The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"; + helpString += "The remove.seqs command parameters are accnos, fasta, name, group, count, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"; helpString += "The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=true. \n"; helpString += "The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n"; helpString += "Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"; @@ -70,6 +71,7 @@ string RemoveSeqsCommand::getOutputFileNameTag(string type, string inputName="") else if (type == "list") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "qfile") { outputFileName = "pick" + m->getExtension(inputName); } else if (type == "alignreport") { outputFileName = "pick.align.report"; } + else if (type == "count") { outputFileName = "pick.count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -93,6 +95,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(){ outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand"); @@ -131,6 +134,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -203,6 +207,14 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["qfile"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -259,13 +271,28 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option) { else { temp = "false"; usedDups = ""; } } dups = m->isTrue(temp); - - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } - - if ((fastafile != "") && (namefile == "")) { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + + if ((countfile == "") && (fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; } + + if (countfile == "") { + if ((fastafile != "") && (namefile == "")) { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -285,6 +312,12 @@ int RemoveSeqsCommand::execute(){ names = m->readAccnos(accnosfile); if (m->control_pressed) { return 0; } + + if (countfile != "") { + if ((fastafile != "") || (listfile != "") || (taxfile != "")) { + m->mothurOut("\n[NOTE]: The count file should contain only unique names, so mothur assumes your fasta, list and taxonomy files also contain only uniques.\n\n"); + } + } //read through the correct file and output lines you want to keep if (namefile != "") { readName(); } @@ -294,6 +327,7 @@ int RemoveSeqsCommand::execute(){ if (listfile != "") { readList(); } if (taxfile != "") { readTax(); } if (qualfile != "") { readQual(); } + if (countfile != "") { readCount(); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -333,7 +367,12 @@ int RemoveSeqsCommand::execute(){ itTypes = outputTypes.find("qfile"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } - } + } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } } return 0; @@ -463,6 +502,56 @@ int RemoveSeqsCommand::readQual(){ } } //********************************************************************************************************************** +int RemoveSeqsCommand::readCount(){ + try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(countfile, in); + + bool wroteSomething = false; + int removedCount = 0; + + string headers = m->getline(in); m->gobble(in); + out << headers << endl; + + string name, rest; int thisTotal; + while (!in.eof()) { + + if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + if (m->debug) { m->mothurOut("[DEBUG]: " + name + '\t' + rest + "\n"); } + + if (names.count(name) == 0) { + out << name << '\t' << thisTotal << '\t' << rest << endl; + wroteSomething = true; + }else { removedCount += thisTotal; } + } + in.close(); + out.close(); + + if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + outputTypes["count"].push_back(outputFileName); outputNames.push_back(outputFileName); + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RemoveSeqsCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** int RemoveSeqsCommand::readList(){ try { string thisOutputDir = outputDir; diff --git a/removeseqscommand.h b/removeseqscommand.h index 474951a..151b413 100644 --- a/removeseqscommand.h +++ b/removeseqscommand.h @@ -34,13 +34,14 @@ class RemoveSeqsCommand : public Command { private: set names; - string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; + string accnosfile, fastafile, namefile, groupfile, countfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; vector outputNames; int readFasta(); int readName(); int readGroup(); + int readCount(); int readAlign(); int readList(); int readTax(); diff --git a/secondarystructurecommand.cpp b/secondarystructurecommand.cpp index 869df02..4d04270 100644 --- a/secondarystructurecommand.cpp +++ b/secondarystructurecommand.cpp @@ -9,13 +9,15 @@ #include "secondarystructurecommand.h" #include "sequence.hpp" +#include "counttable.h" //********************************************************************************************************************** vector AlignCheckCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter pmap("map", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pmap); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -32,7 +34,7 @@ vector AlignCheckCommand::setParameters(){ string AlignCheckCommand::getHelpString(){ try { string helpString = ""; - helpString += "The align.check command reads a fasta file and map file as well as an optional name file.\n"; + helpString += "The align.check command reads a fasta file and map file as well as an optional name or count file.\n"; helpString += "It outputs a file containing the secondary structure matches in the .align.check file.\n"; helpString += "The align.check command parameters are fasta and map, both are required.\n"; helpString += "The align.check command should be in the following format: align.check(fasta=yourFasta, map=yourMap).\n"; @@ -135,6 +137,14 @@ AlignCheckCommand::AlignCheckCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //check for required parameters @@ -155,16 +165,25 @@ AlignCheckCommand::AlignCheckCommand(string option) { else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it } - if ((namefile == "") && (fastafile != "")){ - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if ((namefile == "") && (fastafile != "")){ + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -184,6 +203,11 @@ int AlignCheckCommand::execute(){ readMap(); if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } if (m->control_pressed) { return 0; } @@ -217,7 +241,7 @@ int AlignCheckCommand::execute(){ if (haderror == 1) { m->control_pressed = true; break; } int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != "")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(seq.getName()); @@ -274,7 +298,7 @@ int AlignCheckCommand::execute(){ m->mothurOut("75%-tile:\t" + toString(pound[ptile75]) + "\t" + toString(dash[ptile75]) + "\t" + toString(plus[ptile75]) + "\t" + toString(equal[ptile75]) + "\t" + toString(loop[ptile75]) + "\t" + toString(tilde[ptile75]) + "\t" + toString(total[ptile75])); m->mothurOutEndLine(); m->mothurOut("97.5%-tile:\t" + toString(pound[ptile97_5]) + "\t" + toString(dash[ptile97_5]) + "\t" + toString(plus[ptile97_5]) + "\t" + toString(equal[ptile97_5]) + "\t" + toString(loop[ptile97_5]) + "\t" + toString(tilde[ptile97_5]) + "\t" + toString(total[ptile97_5])); m->mothurOutEndLine(); m->mothurOut("Maximum:\t" + toString(pound[ptile100]) + "\t" + toString(dash[ptile100]) + "\t" + toString(plus[ptile100]) + "\t" + toString(equal[ptile100]) + "\t" + toString(loop[ptile100]) + "\t" + toString(tilde[ptile100]) + "\t" + toString(total[ptile100])); m->mothurOutEndLine(); - if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); } + if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(count)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(count)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(size)); m->mothurOutEndLine(); } diff --git a/secondarystructurecommand.h b/secondarystructurecommand.h index 110f019..becafc5 100644 --- a/secondarystructurecommand.h +++ b/secondarystructurecommand.h @@ -50,7 +50,7 @@ class AlignCheckCommand : public Command { private: vector structMap; - string mapfile, fastafile, outputDir, namefile; + string mapfile, fastafile, outputDir, namefile, countfile; bool abort; int seqLength, haderror; vector outputNames; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 830643d..a9bb573 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -8,13 +8,14 @@ */ #include "seqsummarycommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector SeqSummaryCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -33,8 +34,9 @@ string SeqSummaryCommand::getHelpString(){ try { string helpString = ""; helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n"; - helpString += "The summary.seqs command parameters are fasta, name and processors, fasta is required, unless you have a valid current fasta file.\n"; + helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n"; helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n"; + helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n"; helpString += "The summary.seqs command should be in the following format: \n"; helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n"; @@ -123,6 +125,14 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -142,6 +152,13 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -153,11 +170,12 @@ SeqSummaryCommand::SeqSummaryCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } - + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -186,6 +204,11 @@ int SeqSummaryCommand::execute(){ vector longHomoPolymer; if (namefile != "") { nameMap = m->readNames(namefile); } + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } if (m->control_pressed) { return 0; } @@ -344,7 +367,7 @@ int SeqSummaryCommand::execute(){ int size = startPosition.size(); //find means - float meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; + double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer; meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0; for (int i = 0; i < size; i++) { meanStartPosition += startPosition[i]; @@ -353,6 +376,7 @@ int SeqSummaryCommand::execute(){ meanAmbigBases += ambigBases[i]; meanLongHomoPolymer += longHomoPolymer[i]; } + //this is an int divide so the remainder is lost meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size; @@ -380,7 +404,7 @@ int SeqSummaryCommand::execute(){ m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine(); m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine(); - if (namefile == "") { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } + if ((namefile == "") && (countfile == "")) { m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); } else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); } if (m->control_pressed) { m->mothurRemove(summaryFile); return 0; } @@ -430,11 +454,11 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector::iterator it = nameMap.find(current.getName()); - if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } + if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; } else { num = it->second; } } @@ -505,11 +529,11 @@ int SeqSummaryCommand::MPICreateSummary(int start, int num, vector& startPo if (current.getName() != "") { int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != "")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(current.getName()); - if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your namefile, please correct." << endl; m->control_pressed = true; } + if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; } else { num = it->second; } } @@ -626,14 +650,17 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, vector pDataArray; DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; - + + bool hasNameMap = false; + if ((namefile !="") || (countfile != "")) { hasNameMap = true; } + //Create processor worker threads. for( int i=0; istart, lines[i]->end, namefile, nameMap); + seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap); pDataArray.push_back(tempSum); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. diff --git a/seqsummarycommand.h b/seqsummarycommand.h index 79e8be9..3926e25 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -34,7 +34,7 @@ public: void help() { m->mothurOut(getHelpString()); } private: bool abort; - string fastafile, outputDir, namefile; + string fastafile, outputDir, namefile, countfile; int processors; vector outputNames; map nameMap; @@ -74,18 +74,18 @@ struct seqSumData { unsigned long long end; int count; MothurOut* m; - string namefile; + bool hasNameMap; map nameMap; seqSumData(){} - seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map nam) { + seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, bool na, map nam) { filename = f; sumFile = sf; m = mout; start = st; end = en; - namefile = na; + hasNameMap = na; nameMap = nam; count = 0; } @@ -123,11 +123,11 @@ static DWORD WINAPI MySeqSumThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - if (pDataArray->namefile != "") { + if (pDataArray->hasNameMap){ //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); - if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } + if (it == pDataArray->nameMap.end()) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " is not in your name or count file, please correct."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; } else { num = it->second; } } diff --git a/summaryqualcommand.cpp b/summaryqualcommand.cpp index 5d79713..5a07367 100644 --- a/summaryqualcommand.cpp +++ b/summaryqualcommand.cpp @@ -8,13 +8,14 @@ */ #include "summaryqualcommand.h" - +#include "counttable.h" //********************************************************************************************************************** vector SummaryQualCommand::setParameters(){ try { CommandParameter pqual("qfile", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pqual); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -32,9 +33,10 @@ vector SummaryQualCommand::setParameters(){ string SummaryQualCommand::getHelpString(){ try { string helpString = ""; - helpString += "The summary.qual command reads a quality file and an optional name file, and summarizes the quality information.\n"; - helpString += "The summary.tax command parameters are qfile, name and processors. qfile is required, unless you have a valid current quality file.\n"; + helpString += "The summary.qual command reads a quality file and an optional name or count file, and summarizes the quality information.\n"; + helpString += "The summary.tax command parameters are qfile, name, count and processors. qfile is required, unless you have a valid current quality file.\n"; helpString += "The name parameter allows you to enter a name file associated with your quality file. \n"; + helpString += "The count parameter allows you to enter a count file associated with your quality file. \n"; helpString += "The summary.qual command should be in the following format: \n"; helpString += "summary.qual(qfile=yourQualityFile) \n"; helpString += "Note: No spaces between parameter labels (i.e. qfile), '=' and parameters (i.e.yourQualityFile).\n"; @@ -122,6 +124,14 @@ SummaryQualCommand::SummaryQualCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } //initialize outputTypes @@ -141,6 +151,13 @@ SummaryQualCommand::SummaryQualCommand(string option) { if (namefile == "not open") { namefile = ""; abort = true; } else if (namefile == "not found") { namefile = ""; } else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -152,10 +169,13 @@ SummaryQualCommand::SummaryQualCommand(string option) { m->setProcessors(temp); m->mothurConvert(temp, processors); - if (namefile == "") { - vector files; files.push_back(qualfile); - parser.getNameFile(files); - } + + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(qualfile); + parser.getNameFile(files); + } + } } } catch(exception& e) { @@ -179,7 +199,12 @@ int SummaryQualCommand::execute(){ if (m->control_pressed) { return 0; } if (namefile != "") { nameMap = m->readNames(namefile); } - + else if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameMap = ct.getNameMap(); + } + vector positions; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) positions = m->divideFile(qualfile, processors); @@ -257,7 +282,7 @@ int SummaryQualCommand::driverCreateSummary(vector& position, vector& if (current.getName() != "") { int num = 1; - if (namefile != "") { + if ((namefile != "") || (countfile != "")) { //make sure this sequence is in the namefile, else error map::iterator it = nameMap.find(current.getName()); @@ -400,11 +425,14 @@ int SummaryQualCommand::createProcessesCreateSummary(vector& position, vect DWORD dwThreadIdArray[processors]; HANDLE hThreadArray[processors]; + bool hasNameMap = false; + if ((namefile !="") || (countfile != "")) { hasNameMap = true; } + //Create processor worker threads. for( int i=0; i& position, vector< if (m->control_pressed) { out.close(); return 0; } - float average = averageQ[i] / (float) position[i]; + double average = averageQ[i] / (float) position[i]; out << i << '\t' << position[i] << '\t' << average << '\t'; for (int j = 0; j < 41; j++) { diff --git a/summaryqualcommand.h b/summaryqualcommand.h index 31390b4..ac65938 100644 --- a/summaryqualcommand.h +++ b/summaryqualcommand.h @@ -35,7 +35,7 @@ public: private: bool abort; - string qualfile, outputDir, namefile; + string qualfile, outputDir, namefile, countfile; vector outputNames; map nameMap; int processors; @@ -62,20 +62,21 @@ struct seqSumQualData { vector position; vector averageQ; vector< vector > scores; - string filename, namefile; + string filename; unsigned long long start; unsigned long long end; int count; MothurOut* m; + bool hasNameMap; map nameMap; ~seqSumQualData(){} - seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string n, map nam) { + seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, bool n, map nam) { filename = f; m = mout; start = st; end = en; - namefile = n; + hasNameMap = n; nameMap = nam; count = 0; } @@ -109,7 +110,7 @@ static DWORD WINAPI MySeqSumQualThreadFunction(LPVOID lpParam){ if (current.getName() != "") { int num = 1; - if (pDataArray->namefile != "") { + if (pDataArray->hasNameMap) { //make sure this sequence is in the namefile, else error map::iterator it = pDataArray->nameMap.find(current.getName()); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index b29723c..29b8733 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -11,13 +11,15 @@ #include "needlemanoverlap.hpp" #include "trimoligos.h" + //********************************************************************************************************************** vector TrimSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop); @@ -58,11 +60,12 @@ string TrimSeqsCommand::getHelpString(){ string helpString = ""; helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; - helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"; helpString += "The oligos parameter allows you to provide an oligos file.\n"; helpString += "The name parameter allows you to provide a names file with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file with your fasta file.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; @@ -111,6 +114,7 @@ string TrimSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ else if (type == "fasta") { outputFileName = "fasta"; } else if (type == "group") { outputFileName = "groups"; } else if (type == "name") { outputFileName = "names"; } + else if (type == "count") { outputFileName = "count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -133,6 +137,7 @@ TrimSeqsCommand::TrimSeqsCommand(){ outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); @@ -171,6 +176,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -208,6 +214,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -279,6 +293,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (temp == "not found") { nameFile = ""; } else if(temp == "not open") { nameFile = ""; abort = true; } else { nameFile = temp; m->setNameFile(nameFile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, qThreshold); @@ -331,10 +352,12 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { abort = true; } - if (nameFile == "") { - vector files; files.push_back(fastaFile); - parser.getNameFile(files); - } + if (countfile == "") { + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } + } } } @@ -385,13 +408,27 @@ int TrimSeqsCommand::execute(){ outputTypes["name"].push_back(trimNameFile); outputTypes["name"].push_back(scrapNameFile); } + + string trimCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "trim." + getOutputFileNameTag("count"); + string scrapCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + "scrap." + getOutputFileNameTag("count"); + + if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameCount = ct.getNameMap(); + outputNames.push_back(trimCountFile); + outputNames.push_back(scrapCountFile); + outputTypes["count"].push_back(trimCountFile); + outputTypes["count"].push_back(scrapCountFile); + } + if (m->control_pressed) { return 0; } string outputGroupFileName; if(oligoFile != ""){ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); - if (createGroup) { + if ((createGroup) && (countfile == "")){ outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + getOutputFileNameTag("group"); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } @@ -401,9 +438,9 @@ int TrimSeqsCommand::execute(){ setLines(fastaFile, qFileName); if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); } @@ -446,35 +483,62 @@ int TrimSeqsCommand::execute(){ for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } } outputNames = outputNames2; - for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { - ifstream in; - m->openInputFile(it->first, in); - - ofstream out; - string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + getOutputFileNameTag("group"); - outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); - m->openOutputFile(thisGroupName, out); - - while (!in.eof()){ - if (m->control_pressed) { break; } - - Sequence currSeq(in); m->gobble(in); - out << currSeq.getName() << '\t' << it->second << endl; + for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { + ifstream in; + m->openInputFile(it->first, in); + + ofstream out; + string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)); + if (countfile == "") { thisGroupName += getOutputFileNameTag("group"); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } + else { thisGroupName += getOutputFileNameTag("count"); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } + m->openOutputFile(thisGroupName, out); + + if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; } + + while (!in.eof()){ + if (m->control_pressed) { break; } - if (nameFile != "") { - map::iterator itName = nameMap.find(currSeq.getName()); - if (itName != nameMap.end()) { - vector thisSeqsNames; - m->splitAtChar(itName->second, thisSeqsNames, ','); - for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self - out << thisSeqsNames[k] << '\t' << it->second << endl; - } - }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + Sequence currSeq(in); m->gobble(in); + if (countfile == "") { + out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + }else { + map::iterator itTotalReps = nameCount.find(currSeq.getName()); + if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } } - } - in.close(); - out.close(); - } + } + in.close(); + out.close(); + } + + if (countfile != "") { //create countfile with group info included + CountTable* ct = new CountTable(); + ct->readTable(trimCountFile); + map justTrimmedNames = ct->getNameMap(); + delete ct; + + CountTable newCt; + for (map::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); } + vector tempCounts; tempCounts.resize(groupCounts.size(), 0); + for (map::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) { + newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance + map::iterator it2 = groupMap.find(itNames->first); + if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); } + else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; } + } + newCt.printTable(trimCountFile); + } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -511,6 +575,11 @@ int TrimSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -527,8 +596,7 @@ int TrimSeqsCommand::execute(){ } /**************************************************************************************/ - -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -552,9 +620,16 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string m->openOutputFile(scrapNFileName, scrapNameFile); } + ofstream trimCountFile; + ofstream scrapCountFile; + if(countfile != ""){ + m->openOutputFile(trimCFileName, trimCountFile); + m->openOutputFile(scrapCFileName, scrapCountFile); + if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } ofstream outGroupsFile; - if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); } + if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); } if(allFiles){ for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file @@ -591,14 +666,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (createGroup) { outGroupsFile.close(); } - - if(qFileName != ""){ - qFile.close(); - } - for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } - - return 0; + if ((createGroup) && (countfile == "")) { outGroupsFile.close(); } + if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int success = 1; @@ -720,6 +792,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + + int numRedundants = 0; + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } if (createGroup) { if(barcodes.size() != 0){ @@ -736,9 +817,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { groupMap[currSeq.getName()] = thisGroup; } - int numRedundants = 0; if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { @@ -786,6 +867,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -819,6 +907,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (createGroup) { outGroupsFile.close(); } if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } return count; } @@ -830,7 +919,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { int process = 1; @@ -881,6 +970,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName (scrapQualFileName + toString(getpid()) + ".temp"), (trimNameFileName + toString(getpid()) + ".temp"), (scrapNameFileName + toString(getpid()) + ".temp"), + (trimCountFileName + toString(getpid()) + ".temp"), + (scrapCountFileName + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, @@ -901,6 +992,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { out << it->first << '\t' << it->second << endl; } + + out << groupMap.size() << endl; + for (map::iterator it = groupMap.begin(); it != groupMap.end(); it++) { + out << it->first << '\t' << it->second << endl; + } out.close(); } exit(0); @@ -923,8 +1019,12 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->openOutputFile(trimNameFileName, temp); temp.close(); m->openOutputFile(scrapNameFileName, temp); temp.close(); } + if (countfile != "") { + m->openOutputFile(trimCountFileName, temp); temp.close(); + m->openOutputFile(scrapCountFileName, temp); temp.close(); + } - driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); //force parent to wait until all the processes are done for (int i=0;iopenOutputFile(scrapNameFileName, temp); temp.close(); } - driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]); processIDS.push_back(processors-1); @@ -1022,6 +1124,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } else { groupCounts[it->first] += it->second; } } + for (map::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) { + map::iterator it2 = groupMap.find(it->first); + if (it2 == groupMap.end()) { groupMap[it->first] = it->second; } + else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } CloseHandle(hThreadArray[i]); delete pDataArray[i]; } @@ -1052,8 +1159,15 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName); m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp")); } + + if(countfile != ""){ + m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName); + m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName); + m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp")); + } - if(createGroup){ + if((createGroup)&&(countfile == "")){ m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp")); } @@ -1091,14 +1205,27 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName in >> tempNum; m->gobble(in); if (tempNum != 0) { - while (!in.eof()) { - in >> group >> tempNum; m->gobble(in); + for (int i = 0; i < tempNum; i++) { + int groupNum; + in >> group >> groupNum; m->gobble(in); map::iterator it = groupCounts.find(group); - if (it == groupCounts.end()) { groupCounts[group] = tempNum; } - else { groupCounts[it->first] += tempNum; } + if (it == groupCounts.end()) { groupCounts[group] = groupNum; } + else { groupCounts[it->first] += groupNum; } } } + in >> tempNum; m->gobble(in); + if (tempNum != 0) { + for (int i = 0; i < tempNum; i++) { + string group, seqName; + in >> seqName >> group; m->gobble(in); + + map::iterator it = groupMap.find(seqName); + if (it == groupMap.end()) { groupMap[seqName] = group; } + else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + } + in.close(); m->mothurRemove(tempFile); } #endif @@ -1387,6 +1514,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< string fastaFileName = ""; string qualFileName = ""; string nameFileName = ""; + string countFileName = ""; if(primerName == ""){ comboGroupName = barcodeNameVector[itBar->second]; @@ -1433,7 +1561,6 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< nameFileNames[itBar->second][itPrimer->second] = nameFileName; m->openOutputFile(nameFileName, temp); temp.close(); } - } } } diff --git a/trimseqscommand.h b/trimseqscommand.h index 957f37a..8d9a57a 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -14,8 +14,8 @@ #include "command.hpp" #include "sequence.hpp" #include "qualityscores.h" -#include "groupmap.h" #include "trimoligos.h" +#include "counttable.h" class TrimSeqsCommand : public Command { @@ -36,16 +36,13 @@ public: void help() { m->mothurOut(getHelpString()); } private: - - GroupMap* groupMap; - struct linePair { unsigned long long start; unsigned long long end; linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} linePair() {} }; - + bool getOligos(vector >&, vector >&, vector >&); bool keepFirstTrim(Sequence&, QualityScores&); bool removeLastTrim(Sequence&, QualityScores&); @@ -55,7 +52,7 @@ private: string reverseOligo(string); bool abort, createGroup; - string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; + string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir; bool flip, allFiles, qtrim, keepforward; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; @@ -75,13 +72,15 @@ private: vector barcodeNameVector; //needed here? map groupCounts; map nameMap; + map nameCount; //for countfile name -> repCount + map groupMap; //for countfile name -> group vector processIDS; //processid vector lines; vector qLines; - int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); - int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); + int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >, linePair, linePair); + int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector >, vector >, vector >); int setLines(string, string); }; @@ -92,7 +91,7 @@ private: struct trimData { unsigned long long start, end; MothurOut* m; - string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile; + string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; @@ -105,6 +104,7 @@ struct trimData { map barcodes; map rbarcodes; map primers; + map nameCount; vector linker; vector spacer; map combos; @@ -112,22 +112,26 @@ struct trimData { vector barcodeNameVector; map groupCounts; map nameMap; + map groupMap; trimData(){} - trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, + trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, int pd, int bd, int ld, int sd, int td, map pri, map bar, map rbar, vector revP, vector li, vector spa, vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, - int minL, int maxA, int maxH, int maxL, bool fli, map nm) { + int minL, int maxA, int maxH, int maxL, bool fli, map nm, map ncount) { filename = fn; qFileName = qn; nameFile = nf; + countfile = cf; trimFileName = tn; scrapFileName = sn; trimQFileName = tqn; scrapQFileName = sqn; trimNFileName = tnn; scrapNFileName = snn; + trimCFileName = tcn; + scrapCFileName = scn; groupFileName = gn; fastaFileNames = ffn; qualFileNames = qfn; @@ -137,6 +141,7 @@ struct trimData { qlineStart = qstart; qlineEnd = qend; m = mout; + nameCount = ncount; pdiffs = pd; bdiffs = bd; @@ -203,7 +208,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ ofstream outGroupsFile; - if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); } if(pDataArray->allFiles){ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file @@ -222,6 +227,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } + ofstream trimCountFile; + ofstream scrapCountFile; + if(pDataArray->countfile != ""){ + pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile); + pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile); + if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } + ifstream inFASTA; pDataArray->m->openInputFile(pDataArray->filename, inFASTA); if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { @@ -248,7 +261,11 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if (pDataArray->m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (pDataArray->createGroup) { outGroupsFile.close(); } + if ((pDataArray->createGroup) && (pDataArray->countfile == "")) { outGroupsFile.close(); } + if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(pDataArray->countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + if(pDataArray->qFileName != ""){ qFile.close(); } return 0; } @@ -399,6 +416,15 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } } + int numRedundants = 0; + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } + if (pDataArray->createGroup) { if(pDataArray->barcodes.size() != 0){ string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; @@ -412,9 +438,9 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ } } - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { pDataArray->groupMap[currSeq.getName()] = thisGroup; } - int numRedundants = 0; if (pDataArray->nameFile != "") { map::iterator itName = pDataArray->nameMap.find(currSeq.getName()); if (itName != pDataArray->nameMap.end()) { @@ -462,6 +488,12 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); } } + if (pDataArray->countfile != "") { + map::iterator itCount = pDataArray->nameCount.find(currSeq.getName()); + if (itCount != pDataArray->nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); } + } currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); -- 2.39.2