From: Sarah Westcott Date: Fri, 25 Jan 2013 20:46:33 +0000 (-0500) Subject: added primer.design command. fixed bug with linux unifrac subsampling, metastats... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=4b54ce99af7db8019ea907cd7c2edf789369ada9 added primer.design command. fixed bug with linux unifrac subsampling, metastats output filename, venn command is no shared outs, added check for ':' in sequence names to avoid trouble with trees. added format parameter to make.contigs. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index ea4f7ed..6fed2b9 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -39,6 +39,7 @@ A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; }; A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */; }; A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; }; + A74C06E916A9C0A9008390A3 /* primerdesigncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */; }; A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A74D59A4159A1E2000043046 /* counttable.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D59A3159A1E2000043046 /* counttable.cpp */; }; A754149714840CF7005850D1 /* summaryqualcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A754149614840CF7005850D1 /* summaryqualcommand.cpp */; }; @@ -448,6 +449,8 @@ A7496D2D167B531B00CC7D7C /* kruskalwalliscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kruskalwalliscommand.h; sourceTree = ""; }; A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = ""; }; A74A9A9E148E881E00AB5E3E /* spline.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = spline.cpp; sourceTree = ""; }; + A74C06E616A9C097008390A3 /* primerdesigncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = primerdesigncommand.h; sourceTree = ""; }; + A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = primerdesigncommand.cpp; sourceTree = ""; }; A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = ""; }; A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerauchimecommand.cpp; sourceTree = ""; }; A74D59A3159A1E2000043046 /* counttable.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = counttable.cpp; sourceTree = ""; }; @@ -1455,6 +1458,8 @@ A7E9B79512D37EC400DA6239 /* pipelinepdscommand.cpp */, A7E9B79812D37EC400DA6239 /* preclustercommand.h */, A7E9B79712D37EC400DA6239 /* preclustercommand.cpp */, + A74C06E616A9C097008390A3 /* primerdesigncommand.h */, + A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */, A7E9B7A212D37EC400DA6239 /* quitcommand.h */, A7E9B7A112D37EC400DA6239 /* quitcommand.cpp */, A7E9B7AC12D37EC400DA6239 /* rarefactcommand.h */, @@ -2310,6 +2315,7 @@ 834D9D5C1656DEC800E7FAB9 /* regularizeddecisiontree.cpp in Sources */, A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */, A79EEF8616971D4A0006DEC1 /* filtersharedcommand.cpp in Sources */, + A74C06E916A9C0A9008390A3 /* primerdesigncommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 6ec2d38..114036b 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -782,7 +782,7 @@ int ClassifySeqsCommand::execute(){ } #endif - if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur suspects some of your sequences may be reversed, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); + if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur reversed some your sequences for a better classification. If you would like to take a closer look, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); }else { m->mothurRemove(newaccnosFile); } diff --git a/commandfactory.cpp b/commandfactory.cpp index c8421d2..ce51d72 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -137,6 +137,7 @@ #include "sffmultiplecommand.h" #include "classifysharedcommand.h" #include "filtersharedcommand.h" +#include "primerdesigncommand.h" /*******************************************************/ @@ -297,6 +298,7 @@ CommandFactory::CommandFactory(){ commands["quit"] = "MPIEnabled"; commands["classify.shared"] = "classify.shared"; commands["filter.shared"] = "filter.shared"; + commands["primer.design"] = "primer.design"; } @@ -513,6 +515,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "sff.multiple") { command = new SffMultipleCommand(optionString); } else if(commandName == "classify.shared") { command = new ClassifySharedCommand(optionString); } else if(commandName == "filter.shared") { command = new FilterSharedCommand(optionString); } + else if(commandName == "primer.design") { command = new PrimerDesignCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -670,6 +673,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "sff.multiple") { pipecommand = new SffMultipleCommand(optionString); } else if(commandName == "classify.shared") { pipecommand = new ClassifySharedCommand(optionString); } else if(commandName == "filter.shared") { pipecommand = new FilterSharedCommand(optionString); } + else if(commandName == "primer.design") { pipecommand = new PrimerDesignCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -813,6 +817,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "sff.multiple") { shellcommand = new SffMultipleCommand(); } else if(commandName == "classify.shared") { shellcommand = new ClassifySharedCommand(); } else if(commandName == "filter.shared") { shellcommand = new FilterSharedCommand(); } + else if(commandName == "primer.design") { shellcommand = new PrimerDesignCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/consensusseqscommand.cpp b/consensusseqscommand.cpp index f0fc6bf..9bfdc9c 100644 --- a/consensusseqscommand.cpp +++ b/consensusseqscommand.cpp @@ -219,6 +219,8 @@ int ConsensusSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } + int start = time(NULL); + readFasta(); if (m->control_pressed) { return 0; } @@ -391,6 +393,8 @@ int ConsensusSeqsCommand::execute(){ delete input; } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the consensus sequences."); + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 3bafbcd..dfa012e 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -236,7 +236,11 @@ int CountSeqsCommand::processSmall(string outputFileName){ string firstCol, secondCol; in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); - + //cout << firstCol << '\t' << secondCol << endl; + m->checkName(firstCol); + m->checkName(secondCol); + //cout << firstCol << '\t' << secondCol << endl; + vector names; m->splitAtChar(secondCol, names, ','); @@ -435,6 +439,8 @@ map CountSeqsCommand::processNameFile(string name) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + m->checkName(firstCol); + m->checkName(secondCol); //parse names into vector vector theseNames; m->splitAtComma(secondCol, theseNames); @@ -456,6 +462,8 @@ map CountSeqsCommand::processNameFile(string name) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + m->checkName(firstCol); + m->checkName(secondCol); //parse names into vector vector theseNames; m->splitAtComma(secondCol, theseNames); @@ -507,6 +515,7 @@ map CountSeqsCommand::getGroupNames(string filename, set& n else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + m->checkName(firstCol); it = groupIndex.find(secondCol); if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above groupIndex[secondCol] = count; @@ -529,6 +538,7 @@ map CountSeqsCommand::getGroupNames(string filename, set& n else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + m->checkName(firstCol); it = groupIndex.find(secondCol); if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above groupIndex[secondCol] = count; diff --git a/counttable.cpp b/counttable.cpp index 2ab0e34..ad0b2da 100644 --- a/counttable.cpp +++ b/counttable.cpp @@ -131,6 +131,9 @@ int CountTable::createTable(string namefile, string groupfile, bool createGroup) string firstCol, secondCol; in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); + m->checkName(firstCol); + m->checkName(secondCol); + vector names; m->splitAtChar(secondCol, names, ','); diff --git a/engine.cpp b/engine.cpp index 48f782a..e4e1071 100644 --- a/engine.cpp +++ b/engine.cpp @@ -65,7 +65,9 @@ bool InteractEngine::getInput(){ if (pid == 0) { #endif - + + if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); } + mout->mothurOutEndLine(); input = getCommand(); @@ -115,6 +117,7 @@ bool InteractEngine::getInput(){ //cout << pid << " is in execute " << commandName << endl; #endif //executes valid command + mout->changedSeqNames = false; mout->runParse = true; mout->clearGroups(); mout->clearAllGroups(); @@ -276,7 +279,7 @@ bool BatchEngine::getInput(){ if (input[0] != '#') { - + if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); } mout->mothurOutEndLine(); mout->mothurOut("mothur > " + input); mout->mothurOutEndLine(); @@ -300,6 +303,7 @@ bool BatchEngine::getInput(){ if ((cFactory->MPIEnabled(commandName)) || (pid == 0)) { #endif //executes valid command + mout->changedSeqNames = false; mout->runParse = true; mout->clearGroups(); mout->clearAllGroups(); @@ -413,6 +417,8 @@ bool ScriptEngine::getInput(){ input = getNextCommand(listOfCommands); if (input == "") { input = "quit()"; } + + if (mout->changedSeqNames) { mout->mothurOut("[WARNING]: your sequence names contained ':'. I changed them to '_' to avoid problems in your downstream analysis.\n"); } if (mout->gui) { if ((input.find("quit") != string::npos) || (input.find("set.logfile") != string::npos)) {} @@ -468,6 +474,7 @@ bool ScriptEngine::getInput(){ //cout << pid << " is in execute" << endl; #endif //executes valid command + mout->changedSeqNames = false; mout->runParse = true; mout->clearGroups(); mout->clearAllGroups(); diff --git a/flowdata.cpp b/flowdata.cpp index 1fe7d7f..5dc7dc3 100644 --- a/flowdata.cpp +++ b/flowdata.cpp @@ -42,15 +42,14 @@ FlowData::FlowData(int numFlows, float signal, float noise, int maxHomoP, string bool FlowData::getNext(ifstream& flowFile){ try { - flowFile >> seqName >> endFlow; - if (seqName.length() != 0) { - //cout << "in Flowdata " + seqName << endl; + seqName = getSequenceName(flowFile); + flowFile >> endFlow; + if (!m->control_pressed) { for(int i=0;i> flowData[i]; } - //cout << "in Flowdata read " << seqName + " done" << endl; updateEndFlow(); translateFlow(); m->gobble(flowFile); - }else{ m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } if(flowFile){ return 1; } else { return 0; } @@ -61,6 +60,26 @@ bool FlowData::getNext(ifstream& flowFile){ } } +//******************************************************************************************************************** +string FlowData::getSequenceName(ifstream& flowFile) { + try { + string name = ""; + + flowFile >> name; + + if (name.length() != 0) { + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } + }else{ m->mothurOut("Error in reading your flowfile, at position " + toString(flowFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; } + + return name; + } + catch(exception& e) { + m->errorOut(e, "FlowData", "getSequenceName"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/flowdata.h b/flowdata.h index 1007653..c7fd08a 100644 --- a/flowdata.h +++ b/flowdata.h @@ -38,6 +38,7 @@ private: string seqName, locationString, sequence, baseFlow; int numFlows, maxFlows, endFlow; vector flowData; + string getSequenceName(ifstream&); }; #endif diff --git a/groupmap.cpp b/groupmap.cpp index fb2495c..e5d8427 100644 --- a/groupmap.cpp +++ b/groupmap.cpp @@ -20,7 +20,6 @@ /************************************************************/ GroupMap::~GroupMap(){} - /************************************************************/ int GroupMap::readMap() { try { @@ -45,6 +44,7 @@ int GroupMap::readMap() { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -69,7 +69,7 @@ int GroupMap::readMap() { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your groupfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -114,7 +114,7 @@ int GroupMap::readDesignMap() { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -139,7 +139,7 @@ int GroupMap::readDesignMap() { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -188,7 +188,7 @@ int GroupMap::readMap(string filename) { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -213,7 +213,7 @@ int GroupMap::readMap(string filename) { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your group file contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -261,7 +261,7 @@ int GroupMap::readDesignMap(string filename) { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -286,7 +286,7 @@ int GroupMap::readDesignMap(string filename) { setNamesOfGroups(seqGroup); if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "', group = '" + seqGroup + "'\n"); } - + m->checkName(seqName); it = groupmap.find(seqName); if (it != groupmap.end()) { error = 1; m->mothurOut("Your designfile contains more than 1 sequence named " + seqName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } @@ -325,7 +325,7 @@ string GroupMap::getGroup(string sequenceName) { void GroupMap::setGroup(string sequenceName, string groupN) { setNamesOfGroups(groupN); - + m->checkName(sequenceName); it = groupmap.find(sequenceName); if (it != groupmap.end()) { m->mothurOut("Your groupfile contains more than 1 sequence named " + sequenceName + ", sequence names must be unique. Please correct."); m->mothurOutEndLine(); } diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 32e2d68..4240699 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -33,6 +33,7 @@ vector MakeContigsCommand::setParameters(){ CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); CommandParameter pthreshold("threshold", "Number", "", "40", "", "", "","",false,false); parameters.push_back(pthreshold); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pformat("format", "Multiple", "sanger-illumina-solexa", "sanger", "", "", "","",false,false,true); parameters.push_back(pformat); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -51,13 +52,14 @@ string MakeContigsCommand::getHelpString(){ string helpString = ""; helpString += "The make.contigs command reads a file, forward fastq file and a reverse fastq file or forward fasta and reverse fasta files and outputs new fasta. It will also provide new quality files if the fastq or file parameter is used.\n"; helpString += "If an oligos file is provided barcodes and primers will be trimmed, and a group file will be created.\n"; - helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n"; + helpString += "The make.contigs command parameters are ffastq, rfastq, oligos, format, tdiffs, bdiffs, ldiffs, sdiffs, pdiffs, align, match, mismatch, gapopen, gapextend, allfiles and processors.\n"; helpString += "The ffastq and rfastq, file, or ffasta and rfasta parameters are required.\n"; helpString += "The file parameter is 2 column file containing the forward fastq files in the first column and their matching reverse fastq files in the second column. Mothur will process each pair and create a combined fasta and qual file with all the sequences.\n"; helpString += "The ffastq and rfastq parameters are used to provide a forward fastq and reverse fastq file to process. If you provide one, you must provide the other.\n"; helpString += "The ffasta and rfasta parameters are used to provide a forward fasta and reverse fasta file to process. If you provide one, you must provide the other.\n"; helpString += "The fqfile and rqfile parameters are used to provide a forward quality and reverse quality files to process with the ffasta and rfasta parameters. If you provide one, you must provide the other.\n"; - helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; + helpString += "The format parameter is used to indicate whether your sequences are sanger, solexa or illumina, default=sanger.\n"; + helpString += "The align parameter allows you to specify the alignment method to use. Your options are: gotoh and needleman. The default is needleman.\n"; helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; @@ -317,6 +319,19 @@ MakeContigsCommand::MakeContigsCommand(string option) { align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } if ((align != "needleman") && (align != "gotoh")) { m->mothurOut(align + " is not a valid alignment method. Options are needleman or gotoh. I will use needleman."); m->mothurOutEndLine(); align = "needleman"; } + + format = validParameter.validFile(parameters, "format", false); if (format == "not found"){ format = "sanger"; } + + if ((format != "sanger") && (format != "illumina") && (format != "solexa")) { + m->mothurOut(format + " is not a valid format. Your format choices are sanger, solexa and illumina, aborting." ); m->mothurOutEndLine(); + abort=true; + } + + //fill convert table - goes from solexa to sanger. Used fq_all2std.pl as a reference. + for (int i = -64; i < 65; i++) { + char temp = (char) ((int)(33 + 10*log(1+pow(10,(i/10.0)))/log(10)+0.499)); + convertTable.push_back(temp); + } } } @@ -1508,15 +1523,8 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ if (name2 != "") { if (name != name2) { m->mothurOut("[WARNING]: names do not match. read " + name + " for fasta and " + name2 + " for quality, ignoring."); ignore=true; } } if (quality.length() != sequence.length()) { m->mothurOut("[WARNING]: Lengths do not match for sequence " + name + ". Read " + toString(sequence.length()) + " characters for fasta and " + toString(quality.length()) + " characters for quality scores, ignoring read."); ignore=true; } - vector qualScores; - int controlChar = int('!'); - for (int i = 0; i < quality.length(); i++) { - int temp = int(quality[i]); - temp -= controlChar; - - qualScores.push_back(temp); - } - + vector qualScores = convertQual(quality); + read.name = name; read.sequence = sequence; read.scores = qualScores; @@ -1912,6 +1920,34 @@ string MakeContigsCommand::reverseOligo(string oligo){ } } //********************************************************************************************************************** +vector MakeContigsCommand::convertQual(string qual) { + try { + vector qualScores; + + for (int i = 0; i < qual.length(); i++) { + + int temp = 0; + temp = int(qual[i]); + if (format == "illumina") { + temp -= 64; //char '@' + }else if (format == "solexa") { + temp = int(convertTable[temp]); //convert to sanger + temp -= int('!'); //char '!' + }else { + temp -= int('!'); //char '!' + } + qualScores.push_back(temp); + } + + return qualScores; + } + catch(exception& e) { + m->errorOut(e, "MakeContigsCommand", "convertQual"); + exit(1); + } +} + +//********************************************************************************************************************** diff --git a/makecontigscommand.h b/makecontigscommand.h index 2732d68..43105b8 100644 --- a/makecontigscommand.h +++ b/makecontigscommand.h @@ -60,7 +60,7 @@ public: private: bool abort, allFiles, createGroup; - string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file; + string outputDir, ffastqfile, rfastqfile, align, oligosfile, rfastafile, ffastafile, rqualfile, fqualfile, file, format; float match, misMatch, gapOpen, gapExtend; int processors, longestBase, threshold, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; vector outputNames; @@ -70,11 +70,13 @@ private: vector linker; vector spacer; vector primerNameVector; - vector barcodeNameVector; + vector barcodeNameVector; + vector convertTable; map groupCounts; map groupMap; + vector convertQual(string); fastqRead readFastq(ifstream&, bool&); vector< vector< vector > > preProcessData(unsigned long int&); vector< vector > readFileNames(string); diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index d2c29bd..3844e18 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -691,70 +691,10 @@ int MatrixOutputCommand::process(vector thisLookup){ if (iters != 0) { //we need to find the average distance and standard deviation for each groups distance + vector< vector > calcAverages = m->getAverages(calcDistsTotals, mode); - vector< vector > calcAverages; calcAverages.resize(matrixCalculators.size()); - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - calcAverages[i].resize(calcDistsTotals[0][i].size()); - - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].seq1 = calcDistsTotals[0][i][j].seq1; - calcAverages[i][j].seq2 = calcDistsTotals[0][i][j].seq2; - calcAverages[i][j].dist = 0.0; - } - } - if (mode == "average") { - for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; - if (m->debug) { m->mothurOut("[DEBUG]: Totaling for average calc: iter = " + toString(thisIter) + ", " + thisLookup[calcDistsTotals[thisIter][i][j].seq1]->getGroup() + " - " + thisLookup[calcDistsTotals[thisIter][i][j].seq2]->getGroup() + " distance = " + toString(calcDistsTotals[thisIter][i][j].dist) + ". New total = " + toString(calcAverages[i][j].dist) + ".\n"); } - } - } - } - - for (int i = 0; i < calcAverages.size(); i++) { //finds average. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist /= (float) iters; - } - } - }else { //find median - for (int i = 0; i < calcAverages.size(); i++) { //for each calc - for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison - vector dists; - for (int thisIter = 0; thisIter < iters; thisIter++) { //for each subsample - dists.push_back(calcDistsTotals[thisIter][i][j].dist); - } - sort(dists.begin(), dists.end()); - calcAverages[i][j].dist = dists[(iters/2)]; - } - } - } //find standard deviation - vector< vector > stdDev; stdDev.resize(matrixCalculators.size()); - for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero. - stdDev[i].resize(calcDistsTotals[0][i].size()); - - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].seq1 = calcDistsTotals[0][i][j].seq1; - stdDev[i][j].seq2 = calcDistsTotals[0][i][j].seq2; - stdDev[i][j].dist = 0.0; - } - } - - for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each - for (int i = 0; i < stdDev.size(); i++) { - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist)); - } - } - } - - for (int i = 0; i < stdDev.size(); i++) { //finds average. - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].dist /= (float) iters; - stdDev[i][j].dist = sqrt(stdDev[i][j].dist); - } - } + vector< vector > stdDev = m->getStandardDeviation(calcDistsTotals, calcAverages); //print results for (int i = 0; i < calcDists.size(); i++) { diff --git a/metastatscommand.cpp b/metastatscommand.cpp index 3eaee96..8e61b37 100644 --- a/metastatscommand.cpp +++ b/metastatscommand.cpp @@ -65,7 +65,7 @@ string MetaStatsCommand::getOutputPattern(string type) { try { string pattern = ""; - if (type == "metastats") { pattern = "[filename],[distance],[groups],metastats"; } + if (type == "metastats") { pattern = "[filename],[distance],[group],metastats"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; diff --git a/mothurout.cpp b/mothurout.cpp index 240e7ec..54cdd33 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1606,6 +1606,7 @@ int MothurOut::readTax(string namefile, map& taxMap) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); //are there confidence scores, if so remove them if (secondCol.find_first_of('(') != -1) { removeConfidences(secondCol); } map::iterator itTax = taxMap.find(firstCol); @@ -1633,6 +1634,7 @@ int MothurOut::readTax(string namefile, map& taxMap) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); //are there confidence scores, if so remove them if (secondCol.find_first_of('(') != -1) { removeConfidences(secondCol); } map::iterator itTax = taxMap.find(firstCol); @@ -1684,6 +1686,9 @@ int MothurOut::readNames(string namefile, map& nameMap, bool red else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); + //parse names into vector vector theseNames; splitAtComma(secondCol, theseNames); @@ -1702,10 +1707,13 @@ int MothurOut::readNames(string namefile, map& nameMap, bool red else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); + //parse names into vector vector theseNames; splitAtComma(secondCol, theseNames); - for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; } + for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = firstCol; } pairDone = false; } } @@ -1743,6 +1751,8 @@ int MothurOut::readNames(string namefile, map& nameMap, int flip else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); nameMap[secondCol] = firstCol; pairDone = false; } @@ -1758,6 +1768,8 @@ int MothurOut::readNames(string namefile, map& nameMap, int flip else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); nameMap[secondCol] = firstCol; pairDone = false; } @@ -1797,6 +1809,8 @@ int MothurOut::readNames(string namefile, map& nameMap, map theseNames; splitAtComma(secondCol, theseNames); @@ -1816,6 +1830,8 @@ int MothurOut::readNames(string namefile, map& nameMap, map theseNames; splitAtComma(secondCol, theseNames); @@ -1857,7 +1873,10 @@ int MothurOut::readNames(string namefile, map& nameMap) { if (columnOne) { firstCol = pieces[i]; columnOne=false; } else { secondCol = pieces[i]; pairDone = true; columnOne=true; } - if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; } + if (pairDone) { + checkName(firstCol); + checkName(secondCol); + nameMap[firstCol] = secondCol; pairDone = false; } } } in.close(); @@ -1869,7 +1888,10 @@ int MothurOut::readNames(string namefile, map& nameMap) { if (columnOne) { firstCol = pieces[i]; columnOne=false; } else { secondCol = pieces[i]; pairDone = true; columnOne=true; } - if (pairDone) { nameMap[firstCol] = secondCol; pairDone = false; } + if (pairDone) { + checkName(firstCol); + checkName(secondCol); + nameMap[firstCol] = secondCol; pairDone = false; } } } @@ -1905,6 +1927,8 @@ int MothurOut::readNames(string namefile, map >& nameMap) else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); vector temp; splitAtComma(secondCol, temp); nameMap[firstCol] = temp; @@ -1922,6 +1946,8 @@ int MothurOut::readNames(string namefile, map >& nameMap) else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); vector temp; splitAtComma(secondCol, temp); nameMap[firstCol] = temp; @@ -1963,9 +1989,73 @@ map MothurOut::readNames(string namefile) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); + int num = getNumNames(secondCol); + nameMap[firstCol] = num; + pairDone = false; + } + } + } + in.close(); + + if (rest != "") { + vector pieces = splitWhiteSpace(rest); + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + checkName(firstCol); + checkName(secondCol); + int num = getNumNames(secondCol); + nameMap[firstCol] = num; + pairDone = false; + } + } + } + + return nameMap; + + } + catch(exception& e) { + errorOut(e, "MothurOut", "readNames"); + exit(1); + } +} +/**********************************************************************************************************************/ +map MothurOut::readNames(string namefile, unsigned long int& numSeqs) { + try { + map nameMap; + numSeqs = 0; + + //open input file + ifstream in; + openInputFile(namefile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + + while (!in.eof()) { + if (control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + checkName(firstCol); + checkName(secondCol); int num = getNumNames(secondCol); nameMap[firstCol] = num; pairDone = false; + numSeqs += num; } } } @@ -1978,9 +2068,12 @@ map MothurOut::readNames(string namefile) { else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); int num = getNumNames(secondCol); nameMap[firstCol] = num; pairDone = false; + numSeqs += num; } } } @@ -1993,6 +2086,19 @@ map MothurOut::readNames(string namefile) { exit(1); } } +/************************************************************/ +int MothurOut::checkName(string& name) { + try { + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; changedSeqNames = true; } + } + return 0; + } + catch(exception& e) { + errorOut(e, "MothurOut", "checkName"); + exit(1); + } +} /**********************************************************************************************************************/ int MothurOut::readNames(string namefile, vector& nameVector, map& fastamap) { try { @@ -2019,6 +2125,8 @@ int MothurOut::readNames(string namefile, vector& nameVector, m else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); int num = getNumNames(secondCol); map::iterator it = fastamap.find(firstCol); @@ -2044,6 +2152,8 @@ int MothurOut::readNames(string namefile, vector& nameVector, m else { secondCol = pieces[i]; pairDone = true; columnOne=true; } if (pairDone) { + checkName(firstCol); + checkName(secondCol); int num = getNumNames(secondCol); map::iterator it = fastamap.find(firstCol); @@ -2083,13 +2193,13 @@ set MothurOut::readAccnos(string accnosfile){ in.read(buffer, 4096); vector pieces = splitWhiteSpace(rest, buffer, in.gcount()); - for (int i = 0; i < pieces.size(); i++) { names.insert(pieces[i]); } + for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.insert(pieces[i]); } } in.close(); if (rest != "") { vector pieces = splitWhiteSpace(rest); - for (int i = 0; i < pieces.size(); i++) { names.insert(pieces[i]); } + for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.insert(pieces[i]); } } return names; } @@ -2115,13 +2225,13 @@ int MothurOut::readAccnos(string accnosfile, vector& names){ in.read(buffer, 4096); vector pieces = splitWhiteSpace(rest, buffer, in.gcount()); - for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); } + for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.push_back(pieces[i]); } } in.close(); if (rest != "") { vector pieces = splitWhiteSpace(rest); - for (int i = 0; i < pieces.size(); i++) { names.push_back(pieces[i]); } + for (int i = 0; i < pieces.size(); i++) { checkName(pieces[i]); names.push_back(pieces[i]); } } return 0; @@ -2997,6 +3107,180 @@ vector MothurOut::getStandardDeviation(vector< vector >& dists, exit(1); } } +/**************************************************************************************************/ +vector< vector > MothurOut::getAverages(vector< vector< vector > >& calcDistsTotals, string mode) { + try{ + + vector< vector > calcAverages; //calcAverages.resize(calcDistsTotals[0].size()); + for (int i = 0; i < calcDistsTotals[0].size(); i++) { //initialize sums to zero. + //calcAverages[i].resize(calcDistsTotals[0][i].size()); + vector temp; + for (int j = 0; j < calcDistsTotals[0][i].size(); j++) { + seqDist tempDist; + tempDist.seq1 = calcDistsTotals[0][i][j].seq1; + tempDist.seq2 = calcDistsTotals[0][i][j].seq2; + tempDist.dist = 0.0; + temp.push_back(tempDist); + } + calcAverages.push_back(temp); + } + + if (mode == "average") { + for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; + } + } + } + + for (int i = 0; i < calcAverages.size(); i++) { //finds average. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist /= (float) calcDistsTotals.size(); + } + } + }else { //find median + for (int i = 0; i < calcAverages.size(); i++) { //for each calc + for (int j = 0; j < calcAverages[i].size(); j++) { //for each comparison + vector dists; + for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //for each subsample + dists.push_back(calcDistsTotals[thisIter][i][j].dist); + } + sort(dists.begin(), dists.end()); + calcAverages[i][j].dist = dists[(calcDistsTotals.size()/2)]; + } + } + } + + return calcAverages; + } + catch(exception& e) { + errorOut(e, "MothurOut", "getAverages"); + exit(1); + } +} +/**************************************************************************************************/ +vector< vector > MothurOut::getAverages(vector< vector< vector > >& calcDistsTotals) { + try{ + + vector< vector > calcAverages; //calcAverages.resize(calcDistsTotals[0].size()); + for (int i = 0; i < calcDistsTotals[0].size(); i++) { //initialize sums to zero. + //calcAverages[i].resize(calcDistsTotals[0][i].size()); + vector temp; + for (int j = 0; j < calcDistsTotals[0][i].size(); j++) { + seqDist tempDist; + tempDist.seq1 = calcDistsTotals[0][i][j].seq1; + tempDist.seq2 = calcDistsTotals[0][i][j].seq2; + tempDist.dist = 0.0; + temp.push_back(tempDist); + } + calcAverages.push_back(temp); + } + + + for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; + } + } + } + + for (int i = 0; i < calcAverages.size(); i++) { //finds average. + for (int j = 0; j < calcAverages[i].size(); j++) { + calcAverages[i][j].dist /= (float) calcDistsTotals.size(); + } + } + + return calcAverages; + } + catch(exception& e) { + errorOut(e, "MothurOut", "getAverages"); + exit(1); + } +} +/**************************************************************************************************/ +vector< vector > MothurOut::getStandardDeviation(vector< vector< vector > >& calcDistsTotals) { + try{ + + vector< vector > calcAverages = getAverages(calcDistsTotals); + + //find standard deviation + vector< vector > stdDev; + for (int i = 0; i < calcDistsTotals[0].size(); i++) { //initialize sums to zero. + vector temp; + for (int j = 0; j < calcDistsTotals[0][i].size(); j++) { + seqDist tempDist; + tempDist.seq1 = calcDistsTotals[0][i][j].seq1; + tempDist.seq2 = calcDistsTotals[0][i][j].seq2; + tempDist.dist = 0.0; + temp.push_back(tempDist); + } + stdDev.push_back(temp); + } + + for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdDev.size(); i++) { + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist)); + } + } + } + + for (int i = 0; i < stdDev.size(); i++) { //finds average. + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist /= (float) calcDistsTotals.size(); + stdDev[i][j].dist = sqrt(stdDev[i][j].dist); + } + } + + return stdDev; + } + catch(exception& e) { + errorOut(e, "MothurOut", "getAverages"); + exit(1); + } +} +/**************************************************************************************************/ +vector< vector > MothurOut::getStandardDeviation(vector< vector< vector > >& calcDistsTotals, vector< vector >& calcAverages) { + try{ + //find standard deviation + vector< vector > stdDev; + for (int i = 0; i < calcDistsTotals[0].size(); i++) { //initialize sums to zero. + vector temp; + for (int j = 0; j < calcDistsTotals[0][i].size(); j++) { + seqDist tempDist; + tempDist.seq1 = calcDistsTotals[0][i][j].seq1; + tempDist.seq2 = calcDistsTotals[0][i][j].seq2; + tempDist.dist = 0.0; + temp.push_back(tempDist); + } + stdDev.push_back(temp); + } + + for (int thisIter = 0; thisIter < calcDistsTotals.size(); thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdDev.size(); i++) { + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist)); + } + } + } + + for (int i = 0; i < stdDev.size(); i++) { //finds average. + for (int j = 0; j < stdDev[i].size(); j++) { + stdDev[i][j].dist /= (float) calcDistsTotals.size(); + stdDev[i][j].dist = sqrt(stdDev[i][j].dist); + } + } + + return stdDev; + } + catch(exception& e) { + errorOut(e, "MothurOut", "getAverages"); + exit(1); + } +} + /**************************************************************************************************/ bool MothurOut::isContainingOnlyDigits(string input) { try{ diff --git a/mothurout.h b/mothurout.h index 84f9be3..ffdcaf6 100644 --- a/mothurout.h +++ b/mothurout.h @@ -69,7 +69,7 @@ class MothurOut { vector binLabelsInFile; vector currentBinLabels; string saveNextLabel, argv, sharedHeaderMode, groupMode; - bool printedHeaders, commandInputsConvertError; + bool printedHeaders, commandInputsConvertError, changedSeqNames; //functions from mothur.h //file operations @@ -102,6 +102,7 @@ class MothurOut { set readAccnos(string); int readAccnos(string, vector&); map readNames(string); + map readNames(string, unsigned long int&); int readTax(string, map&); int readNames(string, map&, map&); int readNames(string, map&); @@ -146,6 +147,7 @@ class MothurOut { string removeQuotes(string); string makeList(vector&); bool isSubset(vector, vector); //bigSet, subset + int checkName(string&); //math operation int factorial(int num); @@ -158,6 +160,10 @@ class MothurOut { vector getStandardDeviation(vector< vector >&); vector getStandardDeviation(vector< vector >&, vector&); vector getAverages(vector< vector >&); + vector< vector > getStandardDeviation(vector< vector< vector > >&); + vector< vector > getStandardDeviation(vector< vector< vector > >&, vector< vector >&); + vector< vector > getAverages(vector< vector< vector > >&, string); + vector< vector > getAverages(vector< vector< vector > >&); int control_pressed; bool executing, runParse, jumble, gui, mothurCalling, debug; @@ -252,6 +258,7 @@ class MothurOut { debug = false; sharedHeaderMode = ""; groupMode = "group"; + changedSeqNames = false; } ~MothurOut(); diff --git a/newcommandtemplate.cpp b/newcommandtemplate.cpp index b2426f5..3c893d8 100644 --- a/newcommandtemplate.cpp +++ b/newcommandtemplate.cpp @@ -183,7 +183,7 @@ NewCommand::NewCommand(string option) { ///variables for examples below that you will most likely want to put in the header for //use by the other class functions. - string phylipfile, columnfile, namefile, fastafile, sharedfile, method; + string phylipfile, columnfile, namefile, fastafile, sharedfile, method, countfile; int processors; bool useTiming, allLines; vector Estimators, Groups; @@ -304,10 +304,13 @@ NewCommand::NewCommand(string option) { //saved by mothur that is associated with the other files you are using as inputs. //You can do so by adding the files associated with the namefile to the files vector and then asking parser to check. //This saves our users headaches over file mismatches because they forgot to include the namefile, :) - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } + } diff --git a/primerdesigncommand.cpp b/primerdesigncommand.cpp new file mode 100644 index 0000000..f6ec7b5 --- /dev/null +++ b/primerdesigncommand.cpp @@ -0,0 +1,1239 @@ +// +// primerdesigncommand.cpp +// Mothur +// +// Created by Sarah Westcott on 1/18/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "primerdesigncommand.h" + +//********************************************************************************************************************** +vector PrimerDesignCommand::setParameters(){ + try { + CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","summary-list",false,true,true); parameters.push_back(plist); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","",false,true, true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter plength("length", "Number", "", "18", "", "", "","",false,false); parameters.push_back(plength); + CommandParameter pmintm("mintm", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmintm); + CommandParameter pmaxtm("maxtm", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxtm); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pprocessors); + CommandParameter potunumber("otunumber", "Number", "", "-1", "", "", "","",false,true,true); parameters.push_back(potunumber); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + CommandParameter pcutoff("cutoff", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pcutoff); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string PrimerDesignCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The primer.design allows you to design sequence fragments that are specific to particular OTUs.\n"; + helpString += "The primer.design command parameters are: list, fasta, name, count, otunumber, cutoff, length, pdiffs, mintm, maxtm, processors and label.\n"; + helpString += "The list parameter allows you to provide a list file and is required.\n"; + helpString += "The fasta parameter allows you to provide a fasta file and is required.\n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file associated with your fasta file.\n"; + helpString += "The label parameter is used to indicate the label you want to use from your list file.\n"; + helpString += "The otunumber parameter is used to indicate the otu you want to use from your list file. It is required.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The length parameter is used to indicate the length of the primer. The default is 18.\n"; + helpString += "The mintm parameter is used to indicate minimum melting temperature.\n"; + helpString += "The maxtm parameter is used to indicate maximum melting temperature.\n"; + helpString += "The processors parameter allows you to indicate the number of processors you want to use. Default=1.\n"; + helpString += "The cutoff parameter allows you set a percentage of sequences that support the base. For example: cutoff=97 would only return a sequence that only showed ambiguities for bases that were not supported by at least 97% of sequences.\n"; + helpString += "The primer.desing command should be in the following format: primer.design(list=yourListFile, fasta=yourFastaFile, name=yourNameFile)\n"; + helpString += "primer.design(list=final.an.list, fasta=final.fasta, name=final.names, label=0.03)\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string PrimerDesignCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "fasta") { pattern = "[filename],[distance],otu.cons.fasta"; } + else if (type == "summary") { pattern = "[filename],[distance],primer.summary"; } + else if (type == "list") { pattern = "[filename],pick,[extension]"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** +PrimerDesignCommand::PrimerDesignCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["list"] = tempOutNames; + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "PrimerDesignCommand"); + exit(1); + } +} +//********************************************************************************************************************** +PrimerDesignCommand::PrimerDesignCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + vector tempOutNames; + outputTypes["summary"] = tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["list"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } + + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("list"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["list"] = inputDir + it->second; } + } + } + + //check for parameters + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + else { m->setNameFile(namefile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + //get fastafile - it is required + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { fastafile = ""; abort=true; } + else if (fastafile == "not found") { + fastafile = m->getFastaFile(); + if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setFastaFile(fastafile); } + + //get listfile - it is required + listfile = validParameter.validFile(parameters, "list", true); + if (listfile == "not open") { listfile = ""; abort=true; } + else if (listfile == "not found") { + listfile = m->getListFile(); + if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; } + }else { m->setListFile(listfile); } + + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ + outputDir = m->hasPath(listfile); //if user entered a file with a path then preserve it + } + + string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "100"; } + m->mothurConvert(temp, cutoff); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "18"; } + m->mothurConvert(temp, length); + + temp = validParameter.validFile(parameters, "mintm", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, minTM); + + temp = validParameter.validFile(parameters, "maxtm", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, maxTM); + + temp = validParameter.validFile(parameters, "otunumber", false); if (temp == "not found") { temp = "-1"; } + m->mothurConvert(temp, otunumber); + if (otunumber < 1) { m->mothurOut("[ERROR]: You must provide an OTU number, aborting.\n"); abort = true; } + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); + + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your inputfile."); m->mothurOutEndLine(); label=""; } + + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } + } + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "PrimerDesignCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int PrimerDesignCommand::execute(){ + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + int start = time(NULL); + ////////////////////////////////////////////////////////////////////////////// + // get file inputs // + ////////////////////////////////////////////////////////////////////////////// + + //reads list file and selects the label the users specified or the first label + getListVector(); + if (otunumber > list->getNumBins()) { m->mothurOut("[ERROR]: You selected an OTU number larger than the number of OTUs you have in your list file, quitting.\n"); return 0; } + + map nameMap; + unsigned long int numSeqs; //used to sanity check the files. numSeqs = total seqs for namefile and uniques for count. + //list file should have all seqs if namefile was used to create it and only uniques in count file was used. + + if (namefile != "") { nameMap = m->readNames(namefile, numSeqs); } + else if (countfile != "") { nameMap = readCount(numSeqs); } + else { numSeqs = list->getNumSeqs(); } + + //sanity check + if (numSeqs != list->getNumSeqs()) { + if (namefile != "") { m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your name file contains " + toString(numSeqs) + " sequences, aborting. Do you have the correct files? Perhaps you forgot to include the name file when you clustered? \n"); } + else if (countfile != "") { + m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your count file contains " + toString(numSeqs) + " unique sequences, aborting. Do you have the correct files? Perhaps you forgot to include the count file when you clustered? \n"); + } + m->control_pressed = true; + } + + if (m->control_pressed) { delete list; return 0; } + + ////////////////////////////////////////////////////////////////////////////// + // process data // + ////////////////////////////////////////////////////////////////////////////// + m->mothurOut("\nFinding consensus sequences for each otu..."); cout.flush(); + + vector conSeqs = createProcessesConSeqs(nameMap, numSeqs); + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); + variables["[distance]"] = list->getLabel(); + string consFastaFile = getOutputFileName("fasta", variables); + outputNames.push_back(consFastaFile); outputTypes["fasta"].push_back(consFastaFile); + ofstream out; + m->openOutputFile(consFastaFile, out); + for (int i = 0; i < conSeqs.size(); i++) { conSeqs[i].printSequence(out); } + out.close(); + + m->mothurOut("Done.\n\n"); + + set primers = getPrimer(conSeqs[otunumber-1]); + + if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + string consSummaryFile = getOutputFileName("summary", variables); + outputNames.push_back(consSummaryFile); outputTypes["summary"].push_back(consSummaryFile); + ofstream outSum; + m->openOutputFile(consSummaryFile, outSum); + + outSum << "PrimerOtu: " << otunumber << " Members: " << list->get(otunumber-1) << endl << "Primers\tminTm\tmaxTm" << endl; + + //find min and max melting points + vector minTms; + vector maxTms; + string primerString = ""; + for (set::iterator it = primers.begin(); it != primers.end();) { + + double minTm, maxTm; + findMeltingPoint(*it, minTm, maxTm); + if ((minTM == -1) && (maxTM == -1)) { //user did not set min or max Tm so save this primer + minTms.push_back(minTm); + maxTms.push_back(maxTm); + outSum << *it << '\t' << minTm << '\t' << maxTm << endl; + it++; + }else if ((minTM == -1) && (maxTm <= maxTM)){ //user set max and no min, keep if below max + minTms.push_back(minTm); + maxTms.push_back(maxTm); + outSum << *it << '\t' << minTm << '\t' << maxTm << endl; + it++; + }else if ((maxTM == -1) && (minTm >= minTM)){ //user set min and no max, keep if above min + minTms.push_back(minTm); + maxTms.push_back(maxTm); + outSum << *it << '\t' << minTm << '\t' << maxTm << endl; + it++; + }else if ((maxTm <= maxTM) && (minTm >= minTM)) { //keep if above min and below max + minTms.push_back(minTm); + maxTms.push_back(maxTm); + outSum << *it << '\t' << minTm << '\t' << maxTm << endl; + it++; + }else { primers.erase(it++); } //erase because it didn't qualify + } + + outSum << "\nOTUNumber\tPrimer\tStart\tEnd\tLength\tMismatches\tminTm\tmaxTm\n"; + outSum.close(); + + //check each otu's conseq for each primer in otunumber + set otuToRemove = createProcesses(consSummaryFile, minTms, maxTms, primers, conSeqs); + + if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //print new list file + map mvariables; + mvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); + mvariables["[extension]"] = m->getExtension(listfile); + string newListFile = getOutputFileName("list", mvariables); + outputNames.push_back(newListFile); outputTypes["list"].push_back(newListFile); + ofstream outList; + m->openOutputFile(newListFile, outList); + + outList << list->getLabel() << '\t' << (list->getNumBins()-otuToRemove.size()) << '\t'; + for (int j = 0; j < list->getNumBins(); j++) { + if (m->control_pressed) { break; } + //good otus + if (otuToRemove.count(j) == 0) { + string bin = list->get(j); + if (bin != "") { outList << bin << '\t'; } + } + } + outList << endl; + outList.close(); + + if (m->control_pressed) { delete list; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + delete list; + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process " + toString(list->getNumBins()) + " OTUs.\n"); + + + //output files created by command + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "execute"); + exit(1); + } +} +//********************************************************************/ +//used http://www.biophp.org/minitools/melting_temperature/ as a reference to substitute degenerate bases +// in order to find the min and max Tm values. +//Tm = 64.9°C + 41°C x (number of G’s and C’s in the primer – 16.4)/N + +/* A = adenine + * C = cytosine + * G = guanine + * T = thymine + * R = G A (purine) + * Y = T C (pyrimidine) + * K = G T (keto) + * M = A C (amino) + * S = G C (strong bonds) + * W = A T (weak bonds) + * B = G T C (all but A) + * D = G A T (all but C) + * H = A C T (all but G) + * V = G C A (all but T) + * N = A G C T (any) */ + +int PrimerDesignCommand::findMeltingPoint(string primer, double& minTm, double& maxTm){ + try { + string minTmprimer = primer; + string maxTmprimer = primer; + + //find minimum Tm string substituting for degenerate bases + for (int i = 0; i < minTmprimer.length(); i++) { + minTmprimer[i] = toupper(minTmprimer[i]); + + if (minTmprimer[i] == 'Y') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'R') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'W') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'K') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'M') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'D') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'V') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'H') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'B') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'N') { minTmprimer[i] = 'A'; } + else if (minTmprimer[i] == 'S') { minTmprimer[i] = 'G'; } + } + + //find maximum Tm string substituting for degenerate bases + for (int i = 0; i < maxTmprimer.length(); i++) { + maxTmprimer[i] = toupper(maxTmprimer[i]); + + if (maxTmprimer[i] == 'Y') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'R') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'W') { maxTmprimer[i] = 'A'; } + else if (maxTmprimer[i] == 'K') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'M') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'D') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'V') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'H') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'B') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'N') { maxTmprimer[i] = 'G'; } + else if (maxTmprimer[i] == 'S') { maxTmprimer[i] = 'G'; } + } + + int numGC = 0; + for (int i = 0; i < minTmprimer.length(); i++) { + if (minTmprimer[i] == 'G') { numGC++; } + else if (minTmprimer[i] == 'C') { numGC++; } + } + + minTm = 64.9 + 41 * (numGC - 16.4) / (double) minTmprimer.length(); + + numGC = 0; + for (int i = 0; i < maxTmprimer.length(); i++) { + if (maxTmprimer[i] == 'G') { numGC++; } + else if (maxTmprimer[i] == 'C') { numGC++; } + } + + maxTm = 64.9 + 41 * (numGC - 16.4) / (double) maxTmprimer.length(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "findMeltingPoint"); + exit(1); + } +} +//********************************************************************/ +//search for a primer over the sequence string +bool PrimerDesignCommand::findPrimer(string rawSequence, string primer, vector& primerStart, vector& primerEnd, vector& mismatches){ + try { + bool foundAtLeastOne = false; //innocent til proven guilty + + //look for exact match + if(rawSequence.length() < primer.length()) { return false; } + + //search for primer + for (int j = 0; j < rawSequence.length()-length; j++){ + + if (m->control_pressed) { return foundAtLeastOne; } + + string rawChunk = rawSequence.substr(j, length); + + int numDiff = countDiffs(primer, rawChunk); + + if(numDiff <= pdiffs){ + primerStart.push_back(j); + primerEnd.push_back(j+length); + mismatches.push_back(numDiff); + foundAtLeastOne = true; + } + } + + return foundAtLeastOne; + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "findPrimer"); + exit(1); + } +} +//********************************************************************/ +//find all primers for the given sequence +set PrimerDesignCommand::getPrimer(Sequence primerSeq){ + try { + set primers; + + string rawSequence = primerSeq.getUnaligned(); + + for (int j = 0; j < rawSequence.length()-length; j++){ + if (m->control_pressed) { break; } + + string primer = rawSequence.substr(j, length); + primers.insert(primer); + } + + return primers; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getPrimer"); + exit(1); + } +} +/**************************************************************************************************/ +set PrimerDesignCommand::createProcesses(string newSummaryFile, vector& minTms, vector& maxTms, set& primers, vector& conSeqs) { + try { + + vector processIDS; + int process = 1; + set otusToRemove; + int numBinsProcessed = 0; + + //sanity check + int numBins = conSeqs.size(); + if (numBins < processors) { processors = numBins; } + + //divide the otus between the processors + vector lines; + int numOtusPerProcessor = numBins / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numOtusPerProcessor; + int endIndex = (i+1) * numOtusPerProcessor; + if(i == (processors - 1)){ endIndex = numBins; } + lines.push_back(linePair(startIndex, endIndex)); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + //clear old file because we append in driver + m->mothurRemove(newSummaryFile + toString(getpid()) + ".temp"); + + otusToRemove = driver(newSummaryFile + toString(getpid()) + ".temp", minTms, maxTms, primers, conSeqs, lines[process].start, lines[process].end, numBinsProcessed); + + string tempFile = toString(getpid()) + ".otus2Remove.temp"; + ofstream outTemp; + m->openOutputFile(tempFile, outTemp); + + outTemp << numBinsProcessed << endl; + outTemp << otusToRemove.size() << endl; + for (set::iterator it = otusToRemove.begin(); it != otusToRemove.end(); it++) { outTemp << *it << endl; } + outTemp.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + otusToRemove = driver(newSummaryFile, minTms, maxTms, primers, conSeqs, lines[0].start, lines[0].end, numBinsProcessed); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFile, intemp); + + int num; + intemp >> num; m->gobble(intemp); + if (num != (lines[i+1].end - lines[i+1].start)) { m->mothurOut("[ERROR]: process " + toString(processIDS[i]) + " did not complete processing all OTUs assigned to it, quitting.\n"); m->control_pressed = true; } + intemp >> num; m->gobble(intemp); + for (int k = 0; k < num; k++) { + int otu; + intemp >> otu; m->gobble(intemp); + otusToRemove.insert(otu); + } + intemp.close(); + m->mothurRemove(tempFile); + } + + + #else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the primerDesignData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; imothurRemove(newSummaryFile+extension); + + primerDesignData* tempPrimer = new primerDesignData((newSummaryFile+extension), m, lines[i].start, lines[i].end, minTms, maxTms, primers, conSeqs, pdiffs, otunumber, length, i); + pDataArray.push_back(tempPrimer); + processIDS.push_back(i); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i-1] = CreateThread(NULL, 0, MyPrimerThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]); + } + + + //using the main process as a worker saves time and memory + otusToRemove = driver(newSummaryFile, minTms, maxTms, primers, conSeqs, lines[0].start, lines[0].end, numBinsProcessed); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + for (set::iterator it = pDataArray[i]->otusToRemove.begin(); it != pDataArray[i]->otusToRemove.end(); it++) { + otusToRemove.insert(*it); + } + int num = pDataArray[i]->numBinsProcessed; + if (num != (lines[processIDS[i]].end - lines[processIDS[i]].start)) { m->mothurOut("[ERROR]: process " + toString(processIDS[i]) + " did not complete processing all OTUs assigned to it, quitting.\n"); m->control_pressed = true; } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + + //append output files + for(int i=0;iappendFiles((newSummaryFile + toString(processIDS[i]) + ".temp"), newSummaryFile); + m->mothurRemove((newSummaryFile + toString(processIDS[i]) + ".temp")); + } + + return otusToRemove; + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "createProcesses"); + exit(1); + } +} +//********************************************************************************************************************** +set PrimerDesignCommand::driver(string summaryFileName, vector& minTms, vector& maxTms, set& primers, vector& conSeqs, int start, int end, int& numBinsProcessed){ + try { + set otuToRemove; + + ofstream outSum; + m->openOutputFileAppend(summaryFileName, outSum); + + for (int i = start; i < end; i++) { + + if (m->control_pressed) { break; } + + if (i != (otunumber-1)) { + int primerIndex = 0; + for (set::iterator it = primers.begin(); it != primers.end(); it++) { + vector primerStarts; + vector primerEnds; + vector mismatches; + + bool found = findPrimer(conSeqs[i].getUnaligned(), (*it), primerStarts, primerEnds, mismatches); + + //if we found it report to the table + if (found) { + for (int j = 0; j < primerStarts.size(); j++) { + outSum << (i+1) << '\t' << *it << '\t' << primerStarts[j] << '\t' << primerEnds[j] << '\t' << length << '\t' << mismatches[j] << '\t' << minTms[primerIndex] << '\t' << maxTms[primerIndex] << endl; + } + otuToRemove.insert(i); + } + primerIndex++; + } + } + numBinsProcessed++; + } + outSum.close(); + + + return otuToRemove; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "driver"); + exit(1); + } +} +/**************************************************************************************************/ +vector< vector< vector > > PrimerDesignCommand::driverGetCounts(map& nameMap, unsigned long int& fastaCount, vector& otuCounts, unsigned long long& start, unsigned long long& end){ + try { + vector< vector< vector > > counts; + map seq2Bin; + alignedLength = 0; + + ifstream in; + m->openInputFile(fastafile, in); + + in.seekg(start); + + bool done = false; + fastaCount = 0; + + while (!done) { + if (m->control_pressed) { in.close(); return counts; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + if (fastaCount == 0) { alignedLength = seq.getAligned().length(); initializeCounts(counts, alignedLength, seq2Bin, nameMap, otuCounts); } + else if (alignedLength != seq.getAligned().length()) { + m->mothurOut("[ERROR]: your sequences are not all the same length. primer.design requires sequences to be aligned."); m->mothurOutEndLine(); m->control_pressed = true; break; + } + + int num = 1; + map::iterator itCount; + if (namefile != "") { + itCount = nameMap.find(seq.getName()); + if (itCount == nameMap.end()) { m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your name file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break; } + else { num = itCount->second; } + fastaCount+=num; + }else if (countfile != "") { + itCount = nameMap.find(seq.getName()); + if (itCount == nameMap.end()) { m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your count file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break; } + else { num = itCount->second; } + fastaCount++; + }else { + fastaCount++; + } + + //increment counts + itCount = seq2Bin.find(seq.getName()); + if (itCount == seq2Bin.end()) { + if ((namefile != "") || (countfile != "")) { + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your list file, aborting. Perhaps you forgot to include your name or count file while clustering.\n"); m->mothurOutEndLine(); m->control_pressed = true; break; + }else{ + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your list file, aborting."); m->mothurOutEndLine(); m->control_pressed = true; break; + } + }else { + otuCounts[itCount->second] += num; + string aligned = seq.getAligned(); + for (int i = 0; i < alignedLength; i++) { + char base = toupper(aligned[i]); + if (base == 'A') { counts[itCount->second][i][0]+=num; } + else if (base == 'T') { counts[itCount->second][i][1]+=num; } + else if (base == 'G') { counts[itCount->second][i][2]+=num; } + else if (base == 'C') { counts[itCount->second][i][3]+=num; } + else { counts[itCount->second][i][4]+=num; } + } + } + + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = in.tellg(); + if ((pos == -1) || (pos >= end)) { break; } +#else + if (in.eof()) { break; } +#endif + } + + in.close(); + + return counts; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "driverGetCounts"); + exit(1); + } +} +/**************************************************************************************************/ +vector PrimerDesignCommand::createProcessesConSeqs(map& nameMap, unsigned long int& numSeqs) { + try { + vector< vector< vector > > counts; + vector otuCounts; + vector processIDS; + int process = 1; + unsigned long int fastaCount = 0; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + + vector positions; + vector lines; + positions = m->divideFile(fastafile, processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(fastaLinePair(positions[i], positions[(i+1)])); } + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + counts = driverGetCounts(nameMap, fastaCount, otuCounts, lines[process].start, lines[process].end); + + string tempFile = toString(getpid()) + ".cons_counts.temp"; + ofstream outTemp; + m->openOutputFile(tempFile, outTemp); + + outTemp << fastaCount << endl; + //pass counts + outTemp << counts.size() << endl; + for (int i = 0; i < counts.size(); i++) { + outTemp << counts[i].size() << endl; + for (int j = 0; j < counts[i].size(); j++) { + for (int k = 0; k < 5; k++) { outTemp << counts[i][j][k] << '\t'; } + outTemp << endl; + } + } + //pass otuCounts + outTemp << otuCounts.size() << endl; + for (int i = 0; i < otuCounts.size(); i++) { outTemp << otuCounts[i] << '\t'; } + outTemp << endl; + outTemp.close(); + + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + counts = driverGetCounts(nameMap, fastaCount, otuCounts, lines[0].start, lines[0].end); + + //force parent to wait until all the processes are done + for (int i=0;iopenInputFile(tempFile, intemp); + + unsigned long int num; + intemp >> num; m->gobble(intemp); fastaCount += num; + intemp >> num; m->gobble(intemp); + if (num != counts.size()) { m->mothurOut("[ERROR]: " + tempFile + " was not built correctly by the child process, quitting.\n"); m->control_pressed = true; } + else { + //read counts + for (int k = 0; k < num; k++) { + int alength; + intemp >> alength; m->gobble(intemp); + if (alength != alignedLength) { m->mothurOut("[ERROR]: your sequences are not all the same length. primer.design requires sequences to be aligned."); m->mothurOutEndLine(); m->control_pressed = true; } + else { + for (int j = 0; j < alength; j++) { + for (int l = 0; l < 5; l++) { unsigned int numTemp; intemp >> numTemp; m->gobble(intemp); counts[k][j][l] += numTemp; } + } + } + } + //read otuCounts + intemp >> num; m->gobble(intemp); + for (int k = 0; k < num; k++) { + unsigned int numTemp; intemp >> numTemp; m->gobble(intemp); + otuCounts[k] += numTemp; + } + } + intemp.close(); + m->mothurRemove(tempFile); + } + + +#else + counts = driverGetCounts(nameMap, fastaCount, otuCounts, 0, 1000); +#endif + + //you will have a nameMap error if there is a namefile or countfile, but if those aren't given we want to make sure the fasta and list file match. + if (fastaCount != numSeqs) { + if ((namefile == "") && (countfile == "")) { m->mothurOut("[ERROR]: Your list file contains " + toString(list->getNumSeqs()) + " sequences, and your fasta file contains " + toString(fastaCount) + " sequences, aborting. Do you have the correct files? Perhaps you forgot to include the name or count file? \n"); } + m->control_pressed = true; + } + + vector conSeqs; + + if (m->control_pressed) { return conSeqs; } + + //build consensus seqs + string snumBins = toString(counts.size()); + for (int i = 0; i < counts.size(); i++) { + if (m->control_pressed) { break; } + + string otuLabel = "Otu"; + string sbinNumber = toString(i+1); + if (sbinNumber.length() < snumBins.length()) { + int diff = snumBins.length() - sbinNumber.length(); + for (int h = 0; h < diff; h++) { otuLabel += "0"; } + } + otuLabel += sbinNumber; + + string cons = ""; + for (int j = 0; j < counts[i].size(); j++) { + cons += getBase(counts[i][j], otuCounts[i]); + } + Sequence consSeq(otuLabel, cons); + conSeqs.push_back(consSeq); + } + + if (m->control_pressed) { conSeqs.clear(); return conSeqs; } + + return conSeqs; + + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "createProcessesConSeqs"); + exit(1); + } +} +//*************************************************************************************************************** + +char PrimerDesignCommand::getBase(vector counts, int size){ //A,T,G,C,Gap + try{ + /* A = adenine + * C = cytosine + * G = guanine + * T = thymine + * R = G A (purine) + * Y = T C (pyrimidine) + * K = G T (keto) + * M = A C (amino) + * S = G C (strong bonds) + * W = A T (weak bonds) + * B = G T C (all but A) + * D = G A T (all but C) + * H = A C T (all but G) + * V = G C A (all but T) + * N = A G C T (any) */ + + char conBase = 'N'; + + //zero out counts that don't make the cutoff + float percentage = (100.0 - cutoff) / 100.0; + + for (int i = 0; i < counts.size(); i++) { + float countPercentage = counts[i] / (float) size; + if (countPercentage < percentage) { counts[i] = 0; } + } + + //any + if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'n'; } + //any no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'N'; } + //all but T + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'v'; } + //all but T no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'V'; } + //all but G + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'h'; } + //all but G no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'H'; } + //all but C + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'd'; } + //all but C no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'D'; } + //all but A + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'b'; } + //all but A no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'B'; } + //W = A T (weak bonds) + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'w'; } + //W = A T (weak bonds) no gap + else if ((counts[0] != 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'W'; } + //S = G C (strong bonds) + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 's'; } + //S = G C (strong bonds) no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'S'; } + //M = A C (amino) + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'm'; } + //M = A C (amino) no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'M'; } + //K = G T (keto) + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'k'; } + //K = G T (keto) no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'K'; } + //Y = T C (pyrimidine) + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'y'; } + //Y = T C (pyrimidine) no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'Y'; } + //R = G A (purine) + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'r'; } + //R = G A (purine) no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'R'; } + //only A + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'a'; } + //only A no gap + else if ((counts[0] != 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'A'; } + //only T + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 't'; } + //only T no gap + else if ((counts[0] == 0) && (counts[1] != 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'T'; } + //only G + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = 'g'; } + //only G no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] != 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'G'; } + //only C + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] != 0)) { conBase = 'c'; } + //only C no gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] != 0) && (counts[4] == 0)) { conBase = 'C'; } + //only gap + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] != 0)) { conBase = '-'; } + //cutoff removed all counts + else if ((counts[0] == 0) && (counts[1] == 0) && (counts[2] == 0) && (counts[3] == 0) && (counts[4] == 0)) { conBase = 'N'; } + else{ m->mothurOut("[ERROR]: cannot find consensus base."); m->mothurOutEndLine(); } + + return conBase; + + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getBase"); + exit(1); + } +} + +//********************************************************************************************************************** +int PrimerDesignCommand::initializeCounts(vector< vector< vector > >& counts, int length, map& seq2Bin, map& nameMap, vector& otuCounts){ + try { + counts.clear(); + otuCounts.clear(); + seq2Bin.clear(); + + //vector< vector< vector > > counts - otu < spot_in_alignment < counts_for_A,T,G,C,Gap > > > + for (int i = 0; i < list->getNumBins(); i++) { + string binNames = list->get(i); + vector names; + m->splitAtComma(binNames, names); + otuCounts.push_back(0); + + //lets be smart and only map the unique names if a name or count file was given to save search time and memory + if ((namefile != "") || (countfile != "")) { + for (int j = 0; j < names.size(); j++) { + map::iterator itNames = nameMap.find(names[j]); + if (itNames != nameMap.end()) { //add name because its a unique one + seq2Bin[names[j]] = i; + } + } + }else { //map everyone + for (int j = 0; j < names.size(); j++) { seq2Bin[names[j]] = i; } + } + + vector temp; temp.resize(5, 0); //A,T,G,C,Gap + vector< vector > temp2; + for (int j = 0; j < length; j++) { + temp2.push_back(temp); + } + counts.push_back(temp2); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "initializeCounts"); + exit(1); + } +} +//********************************************************************************************************************** +map PrimerDesignCommand::readCount(unsigned long int& numSeqs){ + try { + map nameMap; + + CountTable ct; + ct.readTable(countfile); + vector namesOfSeqs = ct.getNamesOfSeqs(); + numSeqs = ct.getNumUniqueSeqs(); + + for (int i = 0; i < namesOfSeqs.size(); i++) { + if (m->control_pressed) { break; } + + nameMap[namesOfSeqs[i]] = ct.getNumSeqs(namesOfSeqs[i]); + } + + return nameMap; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "readCount"); + exit(1); + } +} +//********************************************************************************************************************** +int PrimerDesignCommand::getListVector(){ + try { + InputData input(listfile, "list"); + list = input.getListVector(); + string lastLabel = list->getLabel(); + + if (label == "") { label = lastLabel; return 0; } + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set labels; labels.insert(label); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((list != NULL) && (userLabels.size() != 0)) { + if (m->control_pressed) { return 0; } + + if(labels.count(list->getLabel()) == 1){ + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + break; + } + + if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = list->getLabel(); + + delete list; + list = input.getListVector(lastLabel); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + + //restore real lastlabel to save below + list->setLabel(saveLabel); + break; + } + + lastLabel = list->getLabel(); + + //get next line to process + //prevent memory leak + delete list; + list = input.getListVector(); + } + + + if (m->control_pressed) { return 0; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + delete list; + list = input.getListVector(lastLabel); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PrimerDesignCommand", "getListVector"); + exit(1); + } +} +//********************************************************************/ +/* A = adenine + * C = cytosine + * G = guanine + * T = thymine + * R = G A (purine) + * Y = T C (pyrimidine) + * K = G T (keto) + * M = A C (amino) + * S = G C (strong bonds) + * W = A T (weak bonds) + * B = G T C (all but A) + * D = G A T (all but C) + * H = A C T (all but G) + * V = G C A (all but T) + * N = A G C T (any) */ +int PrimerDesignCommand::countDiffs(string oligo, string seq){ + try { + + int length = oligo.length(); + int countDiffs = 0; + + for(int i=0;ierrorOut(e, "PrimerDesignCommand", "countDiffs"); + exit(1); + } +} +//********************************************************************************************************************** + + + diff --git a/primerdesigncommand.h b/primerdesigncommand.h new file mode 100644 index 0000000..78c12de --- /dev/null +++ b/primerdesigncommand.h @@ -0,0 +1,219 @@ +// +// primerdesigncommand.h +// Mothur +// +// Created by Sarah Westcott on 1/18/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_primerdesigncommand_h +#define Mothur_primerdesigncommand_h + +#include "command.hpp" +#include "listvector.hpp" +#include "inputdata.h" +#include "sequence.hpp" +#include "alignment.hpp" +#include "needlemanoverlap.hpp" + +/**************************************************************************************************/ + +class PrimerDesignCommand : public Command { +public: + PrimerDesignCommand(string); + PrimerDesignCommand(); + ~PrimerDesignCommand(){} + + vector setParameters(); + string getCommandName() { return "primer.design"; } + string getCommandCategory() { return "OTU-Based Approaches"; } + + string getOutputPattern(string); + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Primer.design"; } + string getDescription() { return "design sequence fragments that are specific to particular OTUs"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + struct fastaLinePair { + unsigned long long start; + unsigned long long end; + fastaLinePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} + }; + + bool abort, allLines, large; + int cutoff, pdiffs, length, otunumber, processors, alignedLength; + string outputDir, listfile, namefile, countfile, fastafile, label; + double minTM, maxTM; + ListVector* list; + vector outputNames; + + int initializeCounts(vector< vector< vector > >& counts, int length, map&, map&, vector&); + map readCount(unsigned long int&); + char getBase(vector counts, int size); + int getListVector(); + int countDiffs(string, string); + set getPrimer(Sequence); + bool findPrimer(string, string, vector&, vector&, vector&); + int findMeltingPoint(string primer, double&, double&); + + set createProcesses(string, vector&, vector&, set&, vector&); + set driver(string, vector&, vector&, set&, vector&, int, int, int&); + vector< vector< vector > > driverGetCounts(map&, unsigned long int&, vector&, unsigned long long&, unsigned long long&); + vector createProcessesConSeqs(map&, unsigned long int&); + +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct primerDesignData { + string summaryFileName; + MothurOut* m; + int start; + int end; + int pdiffs, threadID, otunumber, length; + set primers; + vector minTms, maxTms; + set otusToRemove; + vector consSeqs; + int numBinsProcessed; + + primerDesignData(){} + primerDesignData(string sf, MothurOut* mout, int st, int en, vector min, vector max, set pri, vector seqs, int d, int otun, int l, int tid) { + summaryFileName = sf; + m = mout; + start = st; + end = en; + pdiffs = d; + minTms = min; + maxTms = max; + primers = pri; + consSeqs = seqs; + otunumber = otun; + length = l; + threadID = tid; + numBinsProcessed = 0; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyPrimerThreadFunction(LPVOID lpParam){ + primerDesignData* pDataArray; + pDataArray = (primerDesignData*)lpParam; + + try { + ofstream outSum; + pDataArray->m->openOutputFileAppend(pDataArray->summaryFileName, outSum); + + for (int i = pDataArray->start; i < pDataArray->end; i++) { + + if (pDataArray->m->control_pressed) { break; } + + if (i != (pDataArray->otunumber-1)) { + int primerIndex = 0; + for (set::iterator it = pDataArray->primers.begin(); it != pDataArray->primers.end(); it++) { + vector primerStarts; + vector primerEnds; + vector mismatches; + + //bool found = findPrimer(conSeqs[i].getUnaligned(), (*it), primerStarts, primerEnds, mismatches); + /////////////////////////////////////////////////////////////////////////////////////////////////// + bool found = false; //innocent til proven guilty + + string rawSequence = pDataArray->consSeqs[i].getUnaligned(); + string primer = *it; + + //look for exact match + if(rawSequence.length() < primer.length()) { found = false; } + else { + //search for primer + for (int j = 0; j < rawSequence.length()-pDataArray->length; j++){ + + if (pDataArray->m->control_pressed) { found = false; break; } + + string rawChunk = rawSequence.substr(j, pDataArray->length); + + //int numDiff = countDiffs(primer, rawchuck); + /////////////////////////////////////////////////////////////////////// + int numDiff = 0; + string oligo = primer; + string seq = rawChunk; + + for(int k=0;klength;k++){ + + oligo[k] = toupper(oligo[k]); + seq[k] = toupper(seq[k]); + + if(oligo[k] != seq[k]){ + + if((oligo[k] == 'N' || oligo[k] == 'I') && (seq[k] == 'N')) { numDiff++; } + else if(oligo[k] == 'R' && (seq[k] != 'A' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'Y' && (seq[k] != 'C' && seq[k] != 'T')) { numDiff++; } + else if(oligo[k] == 'M' && (seq[k] != 'C' && seq[k] != 'A')) { numDiff++; } + else if(oligo[k] == 'K' && (seq[k] != 'T' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'W' && (seq[k] != 'T' && seq[k] != 'A')) { numDiff++; } + else if(oligo[k] == 'S' && (seq[k] != 'C' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'B' && (seq[k] != 'C' && seq[k] != 'T' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'D' && (seq[k] != 'A' && seq[k] != 'T' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'H' && (seq[k] != 'A' && seq[k] != 'T' && seq[k] != 'C')) { numDiff++; } + else if(oligo[k] == 'V' && (seq[k] != 'A' && seq[k] != 'C' && seq[k] != 'G')) { numDiff++; } + else if(oligo[k] == 'A' && (seq[k] != 'A' && seq[k] != 'M' && seq[k] != 'R' && seq[k] != 'W' && seq[k] != 'D' && seq[k] != 'H' && seq[k] != 'V')) { numDiff++; } + else if(oligo[k] == 'C' && (seq[k] != 'C' && seq[k] != 'Y' && seq[k] != 'M' && seq[k] != 'S' && seq[k] != 'B' && seq[k] != 'H' && seq[k] != 'V')) { numDiff++; } + else if(oligo[k] == 'G' && (seq[k] != 'G' && seq[k] != 'R' && seq[k] != 'K' && seq[k] != 'S' && seq[k] != 'B' && seq[k] != 'D' && seq[k] != 'V')) { numDiff++; } + else if(oligo[k] == 'T' && (seq[k] != 'T' && seq[k] != 'Y' && seq[k] != 'K' && seq[k] != 'W' && seq[k] != 'B' && seq[k] != 'D' && seq[k] != 'H')) { numDiff++; } + else if((oligo[k] == '.' || oligo[k] == '-')) { numDiff++; } + } + } + /////////////////////////////////////////////////////////////////////// + + if(numDiff <= pDataArray->pdiffs){ + primerStarts.push_back(j); + primerEnds.push_back(j+pDataArray->length); + mismatches.push_back(numDiff); + found = true; + } + } + } + /////////////////////////////////////////////////////////////////////////////////////////////////// + + //if we found it report to the table + if (found) { + for (int j = 0; j < primerStarts.size(); j++) { + outSum << (i+1) << '\t' << *it << '\t' << primerStarts[j] << '\t' << primerEnds[j] << '\t' << pDataArray->length << '\t' << mismatches[j] << '\t' << pDataArray->minTms[primerIndex] << '\t' << pDataArray->maxTms[primerIndex] << endl; + } + pDataArray->otusToRemove.insert(i); + } + primerIndex++; + } + } + pDataArray->numBinsProcessed++; + } + outSum.close(); + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "PrimerDesignCommand", "MyPrimerThreadFunction"); + exit(1); + } +} +#endif + +/**************************************************************************************************/ + + + + + +#endif diff --git a/qualityscores.cpp b/qualityscores.cpp index 0b7bd06..4998b3d 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -30,89 +30,37 @@ QualityScores::QualityScores(ifstream& qFile){ try { m = MothurOut::getInstance(); - - seqName = ""; + int score; + seqName = getSequenceName(qFile); + + if (!m->control_pressed) { + string qScoreString = m->getline(qFile); + //cout << qScoreString << endl; + while(qFile.peek() != '>' && qFile.peek() != EOF){ + if (m->control_pressed) { break; } + string temp = m->getline(qFile); + //cout << temp << endl; + qScoreString += ' ' + temp; + } + //cout << "done reading " << endl; + istringstream qScoreStringStream(qScoreString); + int count = 0; + while(!qScoreStringStream.eof()){ + if (m->control_pressed) { break; } + string temp; + qScoreStringStream >> temp; m->gobble(qScoreStringStream); + + //check temp to make sure its a number + if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; } + convert(temp, score); + + //cout << count << '\t' << score << endl; + qScores.push_back(score); + count++; + } + } - qFile >> seqName; - m->getline(qFile); - //cout << seqName << endl; - if (seqName == "") { - m->mothurOut("Error reading quality file, name blank at position, " + toString(qFile.tellg())); - m->mothurOutEndLine(); - } - else{ - seqName = seqName.substr(1); - } - - string qScoreString = m->getline(qFile); - //cout << qScoreString << endl; - while(qFile.peek() != '>' && qFile.peek() != EOF){ - if (m->control_pressed) { break; } - string temp = m->getline(qFile); - //cout << temp << endl; - qScoreString += ' ' + temp; - } - //cout << "done reading " << endl; - istringstream qScoreStringStream(qScoreString); - int count = 0; - while(!qScoreStringStream.eof()){ - if (m->control_pressed) { break; } - string temp; - qScoreStringStream >> temp; m->gobble(qScoreStringStream); - - //check temp to make sure its a number - if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; } - convert(temp, score); - - //cout << count << '\t' << score << endl; - qScores.push_back(score); - count++; - } - //qScores.pop_back(); - -// string scores = ""; -// -// while(!qFile.eof()){ -// -// qFile >> seqName; -// -// //get name -// if (seqName.length() != 0) { -// seqName = seqName.substr(1); -// while (!qFile.eof()) { -// char c = qFile.get(); -// //gobble junk on line -// if (c == 10 || c == 13){ break; } -// } -// m->gobble(qFile); -// } -// -// //get scores -// while(qFile){ -// char letter=qFile.get(); -// if((letter == '>')){ qFile.putback(letter); break; } -// else if (isprint(letter)) { scores += letter; } -// } -// m->gobble(qFile); -// -// break; -// } -// -// //convert scores string to qScores -// istringstream qScoreStringStream(scores); -// -// int score; -// while(!qScoreStringStream.eof()){ -// -// if (m->control_pressed) { break; } -// -// qScoreStringStream >> score; -// qScores.push_back(score); -// } -// -// qScores.pop_back(); - seqLength = qScores.size(); //cout << "seqlength = " << seqLength << '\t' << count << endl; @@ -123,7 +71,46 @@ QualityScores::QualityScores(ifstream& qFile){ } } - +//******************************************************************************************************************** +string QualityScores::getSequenceName(ifstream& qFile) { + try { + string name = ""; + + qFile >> name; + m->getline(qFile); + + if (name.length() != 0) { + + name = name.substr(1); + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } + + }else{ m->mothurOut("Error in reading your qfile, at position " + toString(qFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; } + + return name; + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "getSequenceName"); + exit(1); + } +} +//******************************************************************************************************************** +void QualityScores::setName(string name) { + try { + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } + + seqName = name; + } + catch(exception& e) { + m->errorOut(e, "QualityScores", "setName"); + exit(1); + } +} /**************************************************************************************************/ string QualityScores::getName(){ diff --git a/qualityscores.h b/qualityscores.h index 77c3ee0..8769973 100644 --- a/qualityscores.h +++ b/qualityscores.h @@ -36,7 +36,7 @@ public: void updateQScoreErrorMap(map >&, string, int, int, int); void updateForwardMap(vector >&, int, int, int); void updateReverseMap(vector >&, int, int, int); - void setName(string n) { seqName = n; } + void setName(string n); void setScores(vector qs) { qScores = qs; seqLength = qScores.size(); } @@ -48,6 +48,8 @@ private: string seqName; int seqLength; + + string getSequenceName(ifstream&); }; /**************************************************************************************************/ diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 5a9c0c8..2524e65 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -588,7 +588,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ while(!inputNames.eof()){ if (m->control_pressed) { goodNameOut.close(); inputNames.close(); m->mothurRemove(goodNameFile); return 0; } - inputNames >> seqName >> seqList; + inputNames >> seqName; m->gobble(inputNames); inputNames >> seqList; it = badSeqNames.find(seqName); if(it != badSeqNames.end()){ @@ -636,7 +636,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ while(!inputGroups.eof()){ if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodNameFile); m->mothurRemove(goodGroupFile); return 0; } - inputGroups >> seqName >> group; + inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; it = badSeqGroups.find(seqName); @@ -970,7 +970,7 @@ int ScreenSeqsCommand::screenGroupFile(set badSeqNames){ while(!inputGroups.eof()){ if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; } - inputGroups >> seqName >> group; + inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; it = badSeqNames.find(seqName); if(it != badSeqNames.end()){ @@ -1157,7 +1157,7 @@ int ScreenSeqsCommand::screenTaxonomy(set badSeqNames){ while(!input.eof()){ if (m->control_pressed) { goodTaxOut.close(); input.close(); m->mothurRemove(goodTaxFile); return 0; } - input >> seqName >> tax; + input >> seqName; m->gobble(input); input >> tax; it = badSeqNames.find(seqName); if(it != badSeqNames.end()){ badSeqNames.erase(it); } diff --git a/sequence.cpp b/sequence.cpp index 96662bc..93ecb70 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -20,6 +20,10 @@ Sequence::Sequence(string newName, string sequence) { m = MothurOut::getInstance(); initialize(); name = newName; + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } //setUnaligned removes any gap characters for us setUnaligned(sequence); @@ -36,6 +40,10 @@ Sequence::Sequence(string newName, string sequence, string justUnAligned) { m = MothurOut::getInstance(); initialize(); name = newName; + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } //setUnaligned removes any gap characters for us setUnaligned(sequence); @@ -53,11 +61,9 @@ Sequence::Sequence(istringstream& fastaString){ m = MothurOut::getInstance(); initialize(); - fastaString >> name; - - if (name.length() != 0) { + name = getSequenceName(fastaString); - name = name.substr(1); + if (!m->control_pressed) { string sequence; //read comments @@ -84,8 +90,7 @@ Sequence::Sequence(istringstream& fastaString){ setUnaligned(sequence); if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); } - - }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } } catch(exception& e) { @@ -100,11 +105,9 @@ Sequence::Sequence(istringstream& fastaString, string JustUnaligned){ m = MothurOut::getInstance(); initialize(); - fastaString >> name; - - if (name.length() != 0) { + name = getSequenceName(fastaString); - name = name.substr(1); + if (!m->control_pressed) { string sequence; //read comments @@ -131,7 +134,7 @@ Sequence::Sequence(istringstream& fastaString, string JustUnaligned){ if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); } - }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaString.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } } catch(exception& e) { @@ -147,11 +150,9 @@ Sequence::Sequence(ifstream& fastaFile){ try { m = MothurOut::getInstance(); initialize(); - fastaFile >> name; - - if (name.length() != 0) { + name = getSequenceName(fastaFile); - name = name.substr(1); + if (!m->control_pressed) { string sequence; @@ -181,7 +182,7 @@ Sequence::Sequence(ifstream& fastaFile){ if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); } - }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } } catch(exception& e) { @@ -195,13 +196,11 @@ Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){ try { m = MothurOut::getInstance(); initialize(); - fastaFile >> name; extraInfo = ""; - if (name.length() != 0) { - - name = name.substr(1); - + name = getSequenceName(fastaFile); + + if (!m->control_pressed) { string sequence; //read comments @@ -233,8 +232,7 @@ Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){ setUnaligned(sequence); if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); } - - }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } } catch(exception& e) { @@ -248,10 +246,9 @@ Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){ try { m = MothurOut::getInstance(); initialize(); - fastaFile >> name; + name = getSequenceName(fastaFile); - if (name.length() != 0) { - name = name.substr(1); + if (!m->control_pressed) { string sequence; //read comments @@ -279,7 +276,7 @@ Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){ if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); } - }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); } + } } catch(exception& e) { @@ -287,7 +284,54 @@ Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){ exit(1); } } - +//******************************************************************************************************************** +string Sequence::getSequenceName(ifstream& fastaFile) { + try { + string name = ""; + + fastaFile >> name; + + if (name.length() != 0) { + + name = name.substr(1); + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } + + }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; } + + return name; + } + catch(exception& e) { + m->errorOut(e, "Sequence", "getSequenceName"); + exit(1); + } +} +//******************************************************************************************************************** +string Sequence::getSequenceName(istringstream& fastaFile) { + try { + string name = ""; + + fastaFile >> name; + + if (name.length() != 0) { + + name = name.substr(1); + + for (int i = 0; i < name.length(); i++) { + if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } + } + + }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); m->control_pressed = true; } + + return name; + } + catch(exception& e) { + m->errorOut(e, "Sequence", "getSequenceName"); + exit(1); + } +} //******************************************************************************************************************** string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) { try { diff --git a/sequence.hpp b/sequence.hpp index db4c4f3..312e49d 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -69,6 +69,8 @@ private: string getCommentString(ifstream&); string getSequenceString(istringstream&, int&); string getCommentString(istringstream&); + string getSequenceName(ifstream&); + string getSequenceName(istringstream&); string name; string unaligned; string aligned; diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index fdec2ec..9893ea7 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -809,58 +809,10 @@ int SummarySharedCommand::process(vector thisLookup, string if (iters != 0) { //we need to find the average distance and standard deviation for each groups distance - - vector< vector > calcAverages; calcAverages.resize(sumCalculators.size()); - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - calcAverages[i].resize(calcDistsTotals[0][i].size()); - - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].seq1 = calcDists[i][j].seq1; - calcAverages[i][j].seq2 = calcDists[i][j].seq2; - calcAverages[i][j].dist = 0.0; - } - } - - for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; - } - } - } - - for (int i = 0; i < calcAverages.size(); i++) { //finds average. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist /= (float) iters; - } - } + vector< vector > calcAverages = m->getAverages(calcDistsTotals); //find standard deviation - vector< vector > stdDev; stdDev.resize(sumCalculators.size()); - for (int i = 0; i < stdDev.size(); i++) { //initialize sums to zero. - stdDev[i].resize(calcDistsTotals[0][i].size()); - - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].seq1 = calcDists[i][j].seq1; - stdDev[i][j].seq2 = calcDists[i][j].seq2; - stdDev[i][j].dist = 0.0; - } - } - - for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each - for (int i = 0; i < stdDev.size(); i++) { - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].dist += ((calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist) * (calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist)); - } - } - } - - for (int i = 0; i < stdDev.size(); i++) { //finds average. - for (int j = 0; j < stdDev[i].size(); j++) { - stdDev[i][j].dist /= (float) iters; - stdDev[i][j].dist = sqrt(stdDev[i][j].dist); - } - } + vector< vector > stdDev = m->getStandardDeviation(calcDistsTotals, calcAverages); //print results for (int i = 0; i < calcDists.size(); i++) { diff --git a/treegroupscommand.cpp b/treegroupscommand.cpp index 0d01459..7803d89 100644 --- a/treegroupscommand.cpp +++ b/treegroupscommand.cpp @@ -917,31 +917,7 @@ int TreeGroupCommand::process(vector thisLookup) { if (iters != 1) { //we need to find the average distance and standard deviation for each groups distance - - vector< vector > calcAverages; calcAverages.resize(treeCalculators.size()); - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - calcAverages[i].resize(calcDistsTotals[0][i].size()); - - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].seq1 = calcDists[i][j].seq1; - calcAverages[i][j].seq2 = calcDists[i][j].seq2; - calcAverages[i][j].dist = 0.0; - } - } - - for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator - for (int i = 0; i < calcAverages.size(); i++) { //initialize sums to zero. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist; - } - } - } - - for (int i = 0; i < calcAverages.size(); i++) { //finds average. - for (int j = 0; j < calcAverages[i].size(); j++) { - calcAverages[i][j].dist /= (float) iters; - } - } + vector< vector > calcAverages = m->getAverages(calcDistsTotals); //create average tree for each calc for (int i = 0; i < calcDists.size(); i++) { diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 19a2ece..ab8afd0 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -482,39 +482,10 @@ int UnifracUnweightedCommand::getAverageSTDMatrices(vector< vector >& di try { //we need to find the average distance and standard deviation for each groups distance //finds sum - vector averages; //averages.resize(numComp, 0.0); - for (int i = 0; i < numComp; i++) { averages.push_back(0.0); } - - if (m->debug) { m->mothurOut("[DEBUG]: numcomparisons = " + toString(numComp) + ", subsampleIters = " + toString(subsampleIters) + "\n"); } - - for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { - for (int i = 0; i < dists[thisIter].size(); i++) { - averages[i] += dists[thisIter][i]; - } - } - - if (m->debug) { m->mothurOut("[DEBUG]: numcomparisons = " + toString(numComp) + ", subsampleIters = " + toString(subsampleIters) + "\n"); } - - //finds average. - for (int i = 0; i < averages.size(); i++) { - averages[i] /= (float) subsampleIters; - if (m->debug) { m->mothurOut("[DEBUG]: i = " + toString(i) + ", averages[i] = " + toString(averages[i]) + "\n"); } - } + vector averages = m->getAverages(dists); //find standard deviation - vector stdDev; //stdDev.resize(numComp, 0.0); - for (int i = 0; i < numComp; i++) { stdDev.push_back(0.0); } - - for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each - for (int j = 0; j < dists[thisIter].size(); j++) { - stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j])); - } - } - for (int i = 0; i < stdDev.size(); i++) { - stdDev[i] /= (float) subsampleIters; - stdDev[i] = sqrt(stdDev[i]); - if (m->debug) { m->mothurOut("[DEBUG]: i = " + toString(i) + ", stdDev[i] = " + toString(stdDev[i]) + "\n"); } - } + vector stdDev = m->getStandardDeviation(dists, averages); //make matrix with scores in it vector< vector > avedists; //avedists.resize(m->getNumGroups()); diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 47adc9a..94ae125 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -465,40 +465,28 @@ int UnifracWeightedCommand::getAverageSTDMatrices(vector< vector >& dist //we need to find the average distance and standard deviation for each groups distance //finds sum - vector averages; averages.resize(numComp, 0); - for (int thisIter = 0; thisIter < subsampleIters; thisIter++) { - for (int i = 0; i < dists[thisIter].size(); i++) { - averages[i] += dists[thisIter][i]; - } - } - - //finds average. - for (int i = 0; i < averages.size(); i++) { averages[i] /= (float) subsampleIters; } + vector averages = m->getAverages(dists); //find standard deviation - vector stdDev; stdDev.resize(numComp, 0); - - for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each - for (int j = 0; j < dists[thisIter].size(); j++) { - stdDev[j] += ((dists[thisIter][j] - averages[j]) * (dists[thisIter][j] - averages[j])); - } - } - for (int i = 0; i < stdDev.size(); i++) { - stdDev[i] /= (float) subsampleIters; - stdDev[i] = sqrt(stdDev[i]); - } + vector stdDev = m->getStandardDeviation(dists, averages); //make matrix with scores in it - vector< vector > avedists; avedists.resize(m->getNumGroups()); + vector< vector > avedists; //avedists.resize(m->getNumGroups()); for (int i = 0; i < m->getNumGroups(); i++) { - avedists[i].resize(m->getNumGroups(), 0.0); + vector temp; + for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); } + avedists.push_back(temp); } //make matrix with scores in it - vector< vector > stddists; stddists.resize(m->getNumGroups()); + vector< vector > stddists; //stddists.resize(m->getNumGroups()); for (int i = 0; i < m->getNumGroups(); i++) { - stddists[i].resize(m->getNumGroups(), 0.0); + vector temp; + for (int j = 0; j < m->getNumGroups(); j++) { temp.push_back(0.0); } + //stddists[i].resize(m->getNumGroups(), 0.0); + stddists.push_back(temp); } + //flip it so you can print it int count = 0; diff --git a/venn.cpp b/venn.cpp index 2824ca8..66dbb8e 100644 --- a/venn.cpp +++ b/venn.cpp @@ -161,7 +161,7 @@ vector Venn::getPic(vector lookup, vectorgetName() == "sharedsobs") { singleCalc = new Sobs(); - if (sharedOtus) { + if (sharedOtus && (labels.size() != 0)) { string filenameShared = outputDir + m->getRootName(m->getSimpleName(inputfile)) + lookup[0]->getLabel() + "." + vCalcs[i]->getName() + ".sharedotus"; outputNames.push_back(filenameShared); @@ -482,7 +482,7 @@ vector Venn::getPic(vector lookup, vector labels; vector sharedab = vCalcs[i]->getValues(subset, labels); - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() << '\t' << labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -494,7 +494,7 @@ vector Venn::getPic(vector lookup, vector sharedac = vCalcs[i]->getValues(subset, labels); - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -506,7 +506,7 @@ vector Venn::getPic(vector lookup, vector sharedbc = vCalcs[i]->getValues(subset, labels); - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -519,7 +519,7 @@ vector Venn::getPic(vector lookup, vector sharedabc = vCalcs[i]->getValues(subset, labels); - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -674,7 +674,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedAB = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -687,7 +687,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedAC = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -700,7 +700,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedAD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -713,7 +713,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedBC = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -726,7 +726,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedBD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[1]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -739,7 +739,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedCD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[2]->getGroup() + "-" + lookup[3]->getGroup() << '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -754,7 +754,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedABC = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup()+ "-" + lookup[2]->getGroup()<< '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -767,7 +767,7 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedACD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; @@ -780,12 +780,12 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset,labels); sharedBCD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[1]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; } - if (labels.size() != 0) { outShared << labels[labels.size()-1]; } + outShared << labels[labels.size()-1]; outShared << endl; } //cout << "num bcd = " << sharedBCD << endl; @@ -793,19 +793,19 @@ vector Venn::getPic(vector lookup, vectorgetValues(subset, labels); sharedABD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ","; } - if (labels.size() != 0) { outShared << labels[labels.size()-1]; } + outShared << labels[labels.size()-1]; outShared << endl; } //cout << "num abd = " << sharedABD << endl; //get estimate for all four data = vCalcs[i]->getValues(lookup, labels); sharedABCD = data[0]; - if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs")) { + if (sharedOtus && (vCalcs[i]->getName() == "sharedsobs") && (labels.size() != 0)) { outShared << lookup[0]->getGroup() + "-" + lookup[1]->getGroup() + "-" + lookup[2]->getGroup()+ "-" + lookup[3]->getGroup()<< '\t'<< labels.size() << '\t'; for (int k = 0; k < labels.size()-1; k++) { outShared << labels[k] << ",";