From d6c0a11d1cecfac18b323285e7ffadb7f58e848f Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Mon, 20 Feb 2012 15:33:22 -0500 Subject: [PATCH] removed unused copy constructors and comments within comments that were causing nonsense warnings. Added minimum sequence length of 3 bases to correct seg fault error in chimera.perseus. Fixed == bug in chop.seqs that left 'N' bases in sequence. added classify.tree command. worked on cluster.split and shhh.flows parallelization for windows, not complete. optimized shhh.flows parallelization for mac and linux. added linker and spacer options to oligos file of trim.seqs. added ldiffs and sdiffs to allow for differences in linker and spacers in trim.seqs. added keepforward to trim.seqs to allow for keeping the forward primer. --- Mothur.xcodeproj/project.pbxproj | 12 +- alignmentdb.cpp | 27 - alignmentdb.h | 1 - bayesian.cpp | 7 +- blastdb.hpp | 2 - chimeraperseuscommand.cpp | 14 +- chimeraperseuscommand.h | 2 +- chopseqscommand.cpp | 8 +- classifyseqscommand.cpp | 2 +- classifyseqscommand.h | 2 +- classifytreecommand.cpp | 579 +++++++ classifytreecommand.h | 53 + clusterclassic.cpp | 6 +- clustersplitcommand.cpp | 207 ++- clustersplitcommand.h | 206 ++- commandfactory.cpp | 7 +- countseqscommand.cpp | 7 +- database.hpp | 1 - decalc.cpp | 4 +- distancecommand.cpp | 4 +- distancedb.cpp | 4 - distancedb.hpp | 1 - eachgapdist.h | 1 - hcluster.cpp | 2 +- ignoregaps.h | 1 - kmerdb.hpp | 1 - maligner.cpp | 2 +- mothurmetastats.cpp | 8 +- mothurout.cpp | 16 +- mothurout.h | 16 + myPerseus.cpp | 6 +- onegapdist.h | 1 - onegapignore.h | 1 - phylodiversity.cpp | 4 +- preclustercommand.cpp | 17 +- referencedb.cpp | 2 +- seqnoise.cpp | 2 +- sequence.hpp | 3 - sharedrabundfloatvector.cpp | 6 +- sharedrabundvector.cpp | 2 +- shhhercommand.cpp | 2582 ++++++++++++++++++++---------- shhhercommand.h | 1379 ++++++++++++++-- suffixdb.hpp | 7 - suffixnodes.hpp | 2 - suffixtree.cpp | 19 - suffixtree.hpp | 2 - tree.cpp | 2 +- trimoligos.cpp | 167 
+- trimoligos.h | 15 +- trimseqscommand.cpp | 43 +- trimseqscommand.h | 6 +- 51 files changed, 4248 insertions(+), 1223 deletions(-) create mode 100644 classifytreecommand.cpp create mode 100644 classifytreecommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 2e61594..702e3bb 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -326,6 +326,7 @@ A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; }; A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; }; A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; }; + A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */; }; A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; }; A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; }; A7FA2AC714A0E881007C09A6 /* bsplvb.f in Sources */ = {isa = PBXBuildFile; fileRef = A7FA2ABC14A0E881007C09A6 /* bsplvb.f */; }; @@ -1041,6 +1042,8 @@ A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; }; A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = ""; }; A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = ""; }; + 
A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifytreecommand.cpp; sourceTree = ""; }; + A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifytreecommand.h; sourceTree = ""; }; A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = ""; }; A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = ""; }; A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = ""; }; @@ -1316,6 +1319,8 @@ A7E9B69012D37EC400DA6239 /* classifyotucommand.cpp */, A7E9B69312D37EC400DA6239 /* classifyseqscommand.h */, A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */, + A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */, + A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */, A7E9B69712D37EC400DA6239 /* clearcutcommand.h */, A7E9B69612D37EC400DA6239 /* clearcutcommand.cpp */, A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */, @@ -2283,6 +2288,7 @@ A7FA2B5B14A0F0C2007C09A6 /* intrv.f in Sources */, A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */, A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */, + A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -2297,7 +2303,7 @@ GCC_DYNAMIC_NO_PIC = NO; GCC_ENABLE_FIX_AND_CONTINUE = YES; GCC_MODEL_TUNING = G5; - GCC_OPTIMIZATION_LEVEL = 3; + GCC_OPTIMIZATION_LEVEL = 0; INSTALL_PATH = /usr/local/bin; PRODUCT_NAME = Mothur; SDKROOT = macosx10.6; @@ -2326,7 +2332,7 @@ GCC_ENABLE_SSE3_EXTENSIONS = NO; 
GCC_ENABLE_SSE41_EXTENSIONS = NO; GCC_ENABLE_SSE42_EXTENSIONS = NO; - GCC_OPTIMIZATION_LEVEL = 3; + GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../release\\\"\"", "VERSION=\"\\\"1.23.0\\\"\"", @@ -2363,7 +2369,7 @@ GCC_C_LANGUAGE_STANDARD = gnu99; GCC_GENERATE_DEBUGGING_SYMBOLS = NO; GCC_MODEL_TUNING = ""; - GCC_OPTIMIZATION_LEVEL = 3; + GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( "VERSION=\"\\\"1.19.0\\\"\"", "RELEASE_DATE=\"\\\"5/9/2011\\\"\"", diff --git a/alignmentdb.cpp b/alignmentdb.cpp index df4eaec..f59c5db 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -13,33 +13,6 @@ #include "blastdb.hpp" #include "referencedb.h" -/**************************************************************************************************/ -//deep copy -AlignmentDB::AlignmentDB(const AlignmentDB& adb) : numSeqs(adb.numSeqs), longest(adb.longest), method(adb.method), emptySequence(adb.emptySequence), threadID(adb.threadID) { - try { - - m = MothurOut::getInstance(); - if (adb.method == "blast") { - search = new BlastDB(*((BlastDB*)adb.search)); - }else if(adb.method == "kmer") { - search = new KmerDB(*((KmerDB*)adb.search)); - }else if(adb.method == "suffix") { - search = new SuffixDB(*((SuffixDB*)adb.search)); - }else { - m->mothurOut("[ERROR]: cannot create copy of alignment database, unrecognized method - " + adb.method); m->mothurOutEndLine(); - } - - for (int i = 0; i < adb.templateSequences.size(); i++) { - Sequence temp(adb.templateSequences[i]); - templateSequences.push_back(temp); - } - } - catch(exception& e) { - m->errorOut(e, "AlignmentDB", "AlignmentDB"); - exit(1); - } - -} /**************************************************************************************************/ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int tid){ // This assumes that the template database is in fasta format, may try { // need to alter this in 
the future? diff --git a/alignmentdb.h b/alignmentdb.h index 900aadc..537af8d 100644 --- a/alignmentdb.h +++ b/alignmentdb.h @@ -22,7 +22,6 @@ public: AlignmentDB(string, string, int, float, float, float, float, int); //reads fastafile passed in and stores sequences AlignmentDB(string); - AlignmentDB(const AlignmentDB& adb); ~AlignmentDB(); Sequence findClosestSequence(Sequence*); diff --git a/bayesian.cpp b/bayesian.cpp index f7ea6e4..eca63b1 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -111,10 +111,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { //initialze probabilities wordGenusProb.resize(numKmers); WordPairDiffArr.resize(numKmers); - //cout << numKmers << '\t' << genusNodes.size() << endl; + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } - //cout << numKmers << '\t' << genusNodes.size() << endl; - ofstream out; + ofstream out; ofstream out2; #ifdef USE_MPI @@ -505,7 +504,7 @@ map Bayesian::parseTaxMap(string newTax) { exit(1); } } -/**************************************************************************************************/ +**************************************************************************************************/ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) { try{ diff --git a/blastdb.hpp b/blastdb.hpp index e2f4f57..50a8379 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -18,8 +18,6 @@ class BlastDB : public Database { public: BlastDB(string, float, float, float, float, string, int); BlastDB(string, int); - BlastDB(const BlastDB& bdb) : dbFileName(bdb.dbFileName), queryFileName(bdb.queryFileName), blastFileName(bdb.blastFileName), path(bdb.path), - count(bdb.count), gapOpen(bdb.gapOpen), gapExtend(bdb.gapExtend), match(bdb.match), misMatch(bdb.misMatch), Database(bdb) {} ~BlastDB(); void generateDB(); diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index 8eaf536..76f0103 100644 --- 
a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -533,6 +533,7 @@ vector ChimeraPerseusCommand::loadSequences(SequenceParser& parser, str vector sequences; bool error = false; + alignLength = 0; for (int i = 0; i < thisGroupsSeqs.size(); i++) { @@ -543,6 +544,7 @@ vector ChimeraPerseusCommand::loadSequences(SequenceParser& parser, str else { int num = m->getNumNames(it->second); sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num)); + if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); } } } @@ -570,7 +572,8 @@ vector ChimeraPerseusCommand::readFiles(string inputFile, string name){ bool error = false; ifstream in; m->openInputFile(inputFile, in); - + alignLength = 0; + while (!in.eof()) { if (m->control_pressed) { in.close(); return sequences; } @@ -581,6 +584,7 @@ vector ChimeraPerseusCommand::readFiles(string inputFile, string name){ if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); } else { sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second)); + if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); } } } in.close(); @@ -625,7 +629,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque } int numSeqs = sequences.size(); - int alignLength = sequences[0].sequence.size(); + //int alignLength = sequences[0].sequence.size(); ofstream chimeraFile; ofstream accnosFile; @@ -641,7 +645,7 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque for(int i=0;icontrol_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } - + vector restricted = chimeras; vector > leftDiffs(numSeqs); @@ -662,7 +666,9 @@ int ChimeraPerseusCommand::driver(string 
chimeraFileName, vector& seque string dummyA, dummyB; - if(comparisons >= 2){ + if (sequences[i].sequence.size() < 3) { + chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl; + }else if(comparisons >= 2){ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } diff --git a/chimeraperseuscommand.h b/chimeraperseuscommand.h index 84563ac..4562750 100644 --- a/chimeraperseuscommand.h +++ b/chimeraperseuscommand.h @@ -44,7 +44,7 @@ private: bool abort; string fastafile, groupfile, outputDir, namefile; - int processors; + int processors, alignLength; double cutoff, alpha, beta; vector outputNames; diff --git a/chopseqscommand.cpp b/chopseqscommand.cpp index 68576cd..4e06201 100644 --- a/chopseqscommand.cpp +++ b/chopseqscommand.cpp @@ -233,7 +233,7 @@ string ChopSeqsCommand::getChopped(Sequence seq) { for (int i = 0; i < temp.length(); i++) { //eliminate N's - if (toupper(temp[i]) == 'N') { temp[i] == '.'; } + if (toupper(temp[i]) == 'N') { temp[i] = '.'; } numBasesCounted++; @@ -255,7 +255,7 @@ string ChopSeqsCommand::getChopped(Sequence seq) { for (int i = (temp.length()-1); i >= 0; i--) { //eliminate N's - if (toupper(temp[i]) == 'N') { temp[i] == '.'; } + if (toupper(temp[i]) == 'N') { temp[i] = '.'; } numBasesCounted++; @@ -283,7 +283,7 @@ string ChopSeqsCommand::getChopped(Sequence seq) { for (int i = 0; i < temp.length(); i++) { //eliminate N's if (toupper(temp[i]) == 'N') { - temp[i] == '.'; + temp[i] = '.'; tempLength--; if (tempLength < numbases) { stopSpot = 0; break; } } @@ -309,7 +309,7 @@ string ChopSeqsCommand::getChopped(Sequence seq) { for (int i = (temp.length()-1); i >= 0; i--) { //eliminate N's if 
(toupper(temp[i]) == 'N') { - temp[i] == '.'; + temp[i] = '.'; tempLength--; if (tempLength < numbases) { stopSpot = 0; break; } } diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 328cd58..794b961 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -881,7 +881,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string extension = ""; if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); } - classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flipThreshold); + classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip); pDataArray.push_back(tempclass); //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. 
diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 0bf4a91..f0c67ba 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -163,7 +163,7 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); } - else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID, pDataArray->flipThreshold); } + else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } else { pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); pDataArray->m->mothurOutEndLine(); diff --git a/classifytreecommand.cpp b/classifytreecommand.cpp new file mode 100644 index 0000000..e1ef6d3 --- /dev/null +++ b/classifytreecommand.cpp @@ -0,0 +1,579 @@ +// +// classifytreecommand.cpp +// Mothur +// +// Created by Sarah Westcott on 2/20/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. 
+// + +#include "classifytreecommand.h" +#include "phylotree.h" + +//********************************************************************************************************************** +vector ClassifyTreeCommand::setParameters(){ + try { + CommandParameter ptree("tree", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptree); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "", "", "none",false,true); parameters.push_back(ptaxonomy); + CommandParameter pname("name", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "", "", "none",false,false); parameters.push_back(pgroup); + CommandParameter pcutoff("cutoff", "Number", "", "51", "", "", "",false,true); parameters.push_back(pcutoff); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ClassifyTreeCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The classify.tree command reads a tree and taxonomy file and output the concensus taxonomy for each node on the tree. \n"; + helpString += "If you provide a group file, the concensus for each group will also be provided. \n"; + helpString += "The new tree contains labels at each internal node. 
The label is the node number so you can relate the tree to the summary file.\n"; + helpString += "The summary file lists the concensus taxonomy for the descendants of each node.\n"; + helpString += "The classify.tree command parameters are tree, group, name and taxonomy. The tree and taxonomy files are required.\n"; + helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy. The default is 51, meaning 51%. Cutoff cannot be below 51.\n"; + helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n"; + helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e.yourTreefile).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "getHelpString"); + exit(1); + } +} + +//********************************************************************************************************************** +ClassifyTreeCommand::ClassifyTreeCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["tree"] = tempOutNames; + outputTypes["summary"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand"); + exit(1); + } +} +//********************************************************************************************************************** +ClassifyTreeCommand::ClassifyTreeCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (it = 
parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + m->runParse = true; + m->clearGroups(); + m->clearAllGroups(); + m->Treenames.clear(); + m->names.clear(); + + vector tempOutNames; + outputTypes["tree"] = tempOutNames; + outputTypes["summary"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("tree"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["tree"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + + it = parameters.find("taxonomy"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. 
+ if (path == "") { parameters["taxonomy"] = inputDir + it->second; } + } + } + + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //check for required parameters + treefile = validParameter.validFile(parameters, "tree", true); + if (treefile == "not open") { treefile = ""; abort = true; } + else if (treefile == "not found") { treefile = ""; + treefile = m->getTreeFile(); + if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; } + }else { m->setTreeFile(treefile); } + + taxonomyfile = validParameter.validFile(parameters, "taxonomy", true); + if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; } + else if (taxonomyfile == "not found") { taxonomyfile = ""; + taxonomyfile = m->getTaxonomyFile(); + if (taxonomyfile != "") { m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("No valid current files. 
You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; } + }else { m->setTaxonomyFile(taxonomyfile); } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { namefile = ""; abort = true; } + else if (namefile == "not found") { namefile = ""; } + else { m->setNameFile(namefile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { groupfile = ""; abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + else { m->setGroupFile(groupfile); } + + string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; } + m->mothurConvert(temp, cutoff); + + if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; } + + if (namefile == "") { + vector files; files.push_back(treefile); + parser.getNameFile(files); + } + + } + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +int ClassifyTreeCommand::execute(){ + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint); + + int start = time(NULL); + + /***************************************************/ + // reading tree info // + /***************************************************/ + m->setTreeFile(treefile); + if (groupfile != "") { + //read in group map info. 
+ tmap = new TreeMap(groupfile); + tmap->readMap(); + }else{ //fake out by putting everyone in one group + Tree* tree = new Tree(treefile); delete tree; //extracts names from tree to make faked out groupmap + tmap = new TreeMap(); + + for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); } + } + + if (namefile != "") { readNamesFile(); } + + read = new ReadNewickTree(treefile); + int readOk = read->read(tmap); + + if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; } + + read->AssembleTrees(); + vector T = read->getTrees(); + Tree* outputTree = T[0]; + delete read; + + //make sure all files match + //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size. + int numNamesInTree; + if (namefile != "") { + if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); } + else { numNamesInTree = m->Treenames.size(); } + }else { numNamesInTree = m->Treenames.size(); } + + + //output any names that are in group file but not in tree + if (numNamesInTree < tmap->getNumSeqs()) { + for (int i = 0; i < tmap->namesOfSeqs.size(); i++) { + //is that name in the tree? + int count = 0; + for (int j = 0; j < m->Treenames.size(); j++) { + if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it + count++; + } + + if (m->control_pressed) { + delete tmap; for (int i = 0; i < T.size(); i++) { delete T[i]; } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear(); + m->clearGroups(); + return 0; + } + + //then you did not find it so report it + if (count == m->Treenames.size()) { + //if it is in your namefile then don't remove + map::iterator it = nameMap.find(tmap->namesOfSeqs[i]); + + if (it == nameMap.end()) { + m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. 
It will be disregarded."); m->mothurOutEndLine(); + tmap->removeSeq(tmap->namesOfSeqs[i]); + i--; //need this because removeSeq removes name from namesOfSeqs + } + } + } + } + + if (m->control_pressed) { delete outputTree; delete tmap; return 0; } + + readTaxonomyFile(); + + + /***************************************************/ + // get concensus taxonomies // + /***************************************************/ + getClassifications(outputTree); + delete outputTree; delete tmap; + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //set tree file as new current treefile + if (treefile != "") { + string current = ""; + itTypes = outputTypes.find("tree"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); } + } + } + + m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +//traverse tree finding concensus taxonomy at each node +//label node with a number to relate to output summary file +//report all concensus taxonomies to file +int ClassifyTreeCommand::getClassifications(Tree*& T){ + try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(treefile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.summary"; + outputNames.push_back(outputFileName); 
outputTypes["summary"].push_back(outputFileName); + + ofstream out; + m->openOutputFile(outputFileName, out); + out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + //print headings + out << "TreeNode\t"; + if (groupfile != "") { out << "Group\t"; } + out << "NumRep\tTaxonomy" << endl; + + string treeOutputDir = outputDir; + if (outputDir == "") { treeOutputDir += m->hasPath(treefile); } + string outputTreeFileName = treeOutputDir + m->getRootName(m->getSimpleName(treefile)) + "taxonomy.tre"; + + //create a map from tree node index to names of descendants, save time later + map > > nodeToDescendants; //node# -> (groupName -> groupMembers) + for (int i = 0; i < T->getNumNodes(); i++) { + if (m->control_pressed) { return 0; } + + nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants); + } + + //for each node + for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) { + + if (m->control_pressed) { out.close(); return 0; } + + string tax = "not classifed"; + int size; + if (groupfile != "") { + for (map >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) { + if (itGroups->first != "AllGroups") { + tax = getTaxonomy(itGroups->second, size); + out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl; + } + } + }else { + string group = "AllGroups"; + tax = getTaxonomy(nodeToDescendants[i][group], size); + out << (i+1) << '\t' << size << '\t' << tax << endl; + } + + T->tree[i].setLabel((i+1)); + } + out.close(); + + ofstream outTree; + m->openOutputFile(outputTreeFileName, outTree); + outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName); + T->print(outTree, "both"); + outTree.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "GetConcensusTaxonomies"); + exit(1); + } +} +//********************************************************************************************************************** 
+string ClassifyTreeCommand::getTaxonomy(set names, int& size) { + try{ + string conTax = ""; + size = 0; + + //create a tree containing sequences from this bin + PhyloTree* phylo = new PhyloTree(); + + for (set::iterator it = names.begin(); it != names.end(); it++) { + + + //if namesfile include the names + if (namefile != "") { + + //is this sequence in the name file - namemap maps seqName -> repSeqName + map::iterator it2 = nameMap.find(*it); + + if (it2 == nameMap.end()) { //this name is not in name file, skip it + m->mothurOut((*it) + " is not in your name file. I will not include it in the consensus."); m->mothurOutEndLine(); + }else{ + + //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique + map::iterator itTax = taxMap.find((it2->second)); + + if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it + + if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " + it2->second + " and is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); } + else { m->mothurOut((*it) + " is not in your taxonomy file. I will not include it in the consensus."); m->mothurOutEndLine(); } + }else{ + //add seq to tree + int num = nameCount[(*it)]; // we know its there since we found it in nameMap + for (int i = 0; i < num; i++) { phylo->addSeqToTree((*it)+toString(i), it2->second); } + size += num; + } + } + + }else{ + //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique + map::iterator itTax = taxMap.find((*it)); + + if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it + m->mothurOut((*it) + " is not in your taxonomy file. 
I will not include it in the consensus."); m->mothurOutEndLine(); + }else{ + //add seq to tree + phylo->addSeqToTree((*it), itTax->second); + size++; + } + } + + if (m->control_pressed) { delete phylo; return conTax; } + + } + + //build tree + phylo->assignHeirarchyIDs(0); + + TaxNode currentNode = phylo->get(0); + int myLevel = 0; + //at each level + while (currentNode.children.size() != 0) { //you still have more to explore + + TaxNode bestChild; + int bestChildSize = 0; + + //go through children + for (map::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) { + + TaxNode temp = phylo->get(itChild->second); + + //select child with largest accesions - most seqs assigned to it + if (temp.accessions.size() > bestChildSize) { + bestChild = phylo->get(itChild->second); + bestChildSize = temp.accessions.size(); + } + + } + + //is this taxonomy above cutoff + int consensusConfidence = ceil((bestChildSize / (float) size) * 100); + + if (consensusConfidence >= cutoff) { //if yes, add it + conTax += bestChild.name + "(" + toString(consensusConfidence) + ");"; + myLevel++; + }else{ //if no, quit + break; + } + + //move down a level + currentNode = bestChild; + } + + if (myLevel != phylo->getMaxLevel()) { + while (myLevel != phylo->getMaxLevel()) { + conTax += "unclassified;"; + myLevel++; + } + } + if (conTax == "") { conTax = "no_consensus;"; } + + delete phylo; + + return conTax; + + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy"); + exit(1); + } +} + +//********************************************************************************************************************** +map > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map > > descendants){ + try { + map > names; + + map >::iterator it; + map >::iterator it2; + + int lc = T->tree[i].getLChild(); + int rc = T->tree[i].getRChild(); + + if (lc == -1) { //you are a leaf your only descendant is yourself + string group = 
tmap->getGroup(T->tree[i].getName()); + set mynames; mynames.insert(T->tree[i].getName()); + names[group] = mynames; //mygroup -> me + names["AllGroups"] = mynames; + }else{ //your descedants are the combination of your childrens descendants + names = descendants[lc]; + for (it = descendants[rc].begin(); it != descendants[rc].end(); it++) { + it2 = names.find(it->first); //do we already have this group + if (it2 == names.end()) { //nope, so add it + names[it->first] = it->second; + }else { + for (set::iterator it3 = (it->second).begin(); it3 != (it->second).end(); it3++) { + names[it->first].insert(*it3); + } + } + + } + } + + return names; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "getDescendantList"); + exit(1); + } +} +//********************************************************************************************************************** +int ClassifyTreeCommand::readTaxonomyFile() { + try { + + ifstream in; + m->openInputFile(taxonomyfile, in); + + string name, tax; + + while(!in.eof()){ + in >> name >> tax; + m->gobble(in); + + //are there confidence scores, if so remove them + if (tax.find_first_of('(') != -1) { m->removeConfidences(tax); } + + taxMap[name] = tax; + + if (m->control_pressed) { in.close(); taxMap.clear(); return 0; } + } + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile"); + exit(1); + } +} + +/*****************************************************************/ +int ClassifyTreeCommand::readNamesFile() { + try { + ifstream inNames; + m->openInputFile(namefile, inNames); + + string name, names; + + while(!inNames.eof()){ + inNames >> name; //read from first column A + inNames >> names; //read from second column A,B,C,D + m->gobble(inNames); + + //parse names into vector + vector theseNames; + m->splitAtComma(names, theseNames); + + for (int i = 0; i < theseNames.size(); i++) { nameMap[theseNames[i]] = name; } + nameCount[name] = theseNames.size(); + + if 
(m->control_pressed) { inNames.close(); nameMap.clear(); return 0; } + } + inNames.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ClassifyTreeCommand", "readNamesFile"); + exit(1); + } +} + +/*****************************************************************/ + + diff --git a/classifytreecommand.h b/classifytreecommand.h new file mode 100644 index 0000000..bd0e1ce --- /dev/null +++ b/classifytreecommand.h @@ -0,0 +1,53 @@ +#ifndef Mothur_classifytreecommand_h +#define Mothur_classifytreecommand_h + +// +// classifytreecommand.h +// Mothur +// +// Created by Sarah Westcott on 2/20/12. +// Copyright (c) 2012 Schloss Lab. All rights reserved. +// + +#include "command.hpp" +#include "readtree.h" +#include "treemap.h" + +class ClassifyTreeCommand : public Command { +public: + ClassifyTreeCommand(string); + ClassifyTreeCommand(); + ~ClassifyTreeCommand(){} + + vector setParameters(); + string getCommandName() { return "classify.tree"; } + string getCommandCategory() { return "Phylotype Analysis"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Classify.tree"; } + string getDescription() { return "Find the concensus taxonomy for the descendant of each tree node"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + ReadTree* read; + TreeMap* tmap; + string treefile, taxonomyfile, groupfile, namefile, outputDir; + bool abort; + vector outputNames; + int numUniquesInName, cutoff; + map nameMap; + map nameCount; + map taxMap; + + int getClassifications(Tree*&); + map > getDescendantList(Tree*&, int, map > >); + string getTaxonomy(set, int&); + int readNamesFile(); + int readTaxonomyFile(); + +}; + + + +#endif diff --git a/clusterclassic.cpp b/clusterclassic.cpp index 0048dc6..1ce81c4 100644 --- a/clusterclassic.cpp +++ b/clusterclassic.cpp @@ -434,11 +434,11 @@ void ClusterClassic::print() { try { //update location of seqs in smallRow since they move to smallCol now for (int i = 0; i 
< dMatrix.size(); i++) { - cout << "row = " << i << '\t'; + m->mothurOut("row = " + toString(i) + "\t"); for (int j = 0; j < dMatrix[i].size(); j++) { - cout << dMatrix[i][j] << '\t'; + m->mothurOut(toString(dMatrix[i][j]) + "\t"); } - cout << endl; + m->mothurOutEndLine(); } } catch(exception& e) { diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 34caf65..bb9296f 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -8,12 +8,7 @@ */ #include "clustersplitcommand.h" -#include "readcluster.h" -#include "splitmatrix.h" -#include "readphylip.h" -#include "readcolumn.h" -#include "readmatrix.hpp" -#include "inputdata.h" + //********************************************************************************************************************** @@ -555,7 +550,7 @@ int ClusterSplitCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); #else - + ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED /////////////////////// //sanity check if (processors > distName.size()) { processors = distName.size(); } @@ -563,66 +558,8 @@ int ClusterSplitCommand::execute(){ if(processors == 1){ listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files }else{ - - //cout << processors << '\t' << distName.size() << endl; - vector < vector < map > > dividedNames; //distNames[1] = vector of filenames for process 1... 
- dividedNames.resize(processors); - - //for each file group figure out which process will complete it - //want to divide the load intelligently so the big files are spread between processes - for (int i = 0; i < distName.size(); i++) { - //cout << i << endl; - int processToAssign = (i+1) % processors; - if (processToAssign == 0) { processToAssign = processors; } - - dividedNames[(processToAssign-1)].push_back(distName[i]); - } - - //not lets reverse the order of ever other process, so we balance big files running with little ones - for (int i = 0; i < processors; i++) { - //cout << i << endl; - int remainder = ((i+1) % processors); - if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); } - } - - createProcesses(dividedNames); - - if (m->control_pressed) { return 0; } - - //get list of list file names from each process - for(int i=0;iopenInputFile(filename, in); - - in >> tag; m->gobble(in); - - while(!in.eof()) { - string tempName; - in >> tempName; m->gobble(in); - listFileNames.push_back(tempName); - } - in.close(); - m->mothurRemove((toString(processIDS[i]) + ".temp")); - - //get labels - filename = toString(processIDS[i]) + ".temp.labels"; - ifstream in2; - m->openInputFile(filename, in2); - - float tempCutoff; - in2 >> tempCutoff; m->gobble(in2); - if (tempCutoff < cutoff) { cutoff = tempCutoff; } - - while(!in2.eof()) { - string tempName; - in2 >> tempName; m->gobble(in2); - if (labels.count(tempName) == 0) { labels.insert(tempName); } - } - in2.close(); - m->mothurRemove((toString(processIDS[i]) + ".temp.labels")); - } - } + listFileNames = createProcesses(distName, labels); + } #else listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files #endif @@ -904,12 +841,35 @@ void ClusterSplitCommand::printData(ListVector* oldList){ } } //********************************************************************************************************************** -int 
ClusterSplitCommand::createProcesses(vector < vector < map > > dividedNames){ +vector ClusterSplitCommand::createProcesses(vector< map > distName, set& labels){ try { + + vector listFiles; + vector < vector < map > > dividedNames; //distNames[1] = vector of filenames for process 1... + dividedNames.resize(processors); + + //for each file group figure out which process will complete it + //want to divide the load intelligently so the big files are spread between processes + for (int i = 0; i < distName.size(); i++) { + //cout << i << endl; + int processToAssign = (i+1) % processors; + if (processToAssign == 0) { processToAssign = processors; } + + dividedNames[(processToAssign-1)].push_back(distName[i]); + if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); } + } + + //not lets reverse the order of ever other process, so we balance big files running with little ones + for (int i = 0; i < processors; i++) { + //cout << i << endl; + int remainder = ((i+1) % processors); + if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); } + } + + if (m->control_pressed) { return listFiles; } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - int exitCommand = 1; + int process = 1; processIDS.clear(); //loop through and create all the processes you want @@ -950,14 +910,99 @@ int ClusterSplitCommand::createProcesses(vector < vector < map > } } + //do your part + listFiles = cluster(dividedNames[0], labels); + //force parent to wait until all the processes are done - for (int i=0;iopenInputFile(filename, in); + + in >> tag; m->gobble(in); + + while(!in.eof()) { + string tempName; + in >> tempName; m->gobble(in); + listFiles.push_back(tempName); + } + in.close(); + m->mothurRemove((toString(processIDS[i]) + ".temp")); + + //get labels + filename = toString(processIDS[i]) + ".temp.labels"; + ifstream in2; + m->openInputFile(filename, in2); + + float tempCutoff; + in2 >> tempCutoff; m->gobble(in2); + if 
(tempCutoff < cutoff) { cutoff = tempCutoff; } + + while(!in2.eof()) { + string tempName; + in2 >> tempName; m->gobble(in2); + if (labels.count(tempName) == 0) { labels.insert(tempName); } + } + in2.close(); + m->mothurRemove((toString(processIDS[i]) + ".temp.labels")); + } + + + #else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the clusterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add labels. + ////////////////////////////////////////////////////////////////////////////////////////////////////// - return exitCommand; + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; itag; + //get listfiles created + for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); } + //get labels + set::iterator it; + for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); } + //check cutoff + if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + #endif + + return listFiles; } catch(exception& e) { @@ -969,18 +1014,19 @@ int ClusterSplitCommand::createProcesses(vector < vector < map > vector ClusterSplitCommand::cluster(vector< map > distNames, set& labels){ try { - Cluster* cluster; - SparseMatrix* matrix; - ListVector* list; - ListVector oldList; - RAbundVector* rabund; vector listFileNames; - double smallestCutoff = cutoff; //cluster each distance file for (int i = 0; i < distNames.size(); i++) { + + Cluster* cluster = NULL; + SparseMatrix* matrix = NULL; + ListVector* list = NULL; + ListVector oldList; + RAbundVector* rabund = NULL; + if 
(m->control_pressed) { return listFileNames; } string thisNamefile = distNames[i].begin()->second; @@ -1011,8 +1057,8 @@ vector ClusterSplitCommand::cluster(vector< map > distNa oldList = *list; matrix = read->getMatrix(); - delete read; - delete nameMap; + delete read; read = NULL; + delete nameMap; nameMap = NULL; #ifdef USE_MPI @@ -1097,6 +1143,7 @@ vector ClusterSplitCommand::cluster(vector< map > distNa } delete matrix; delete list; delete cluster; delete rabund; + matrix = NULL; list = NULL; cluster = NULL; rabund = NULL; listFile.close(); if (m->control_pressed) { //clean up diff --git a/clustersplitcommand.h b/clustersplitcommand.h index c4b236c..cf52ff2 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -16,7 +16,13 @@ #include "listvector.hpp" #include "cluster.hpp" #include "sparsematrix.hpp" - +#include "readcluster.h" +#include "splitmatrix.h" +#include "readphylip.h" +#include "readcolumn.h" +#include "readmatrix.hpp" +#include "inputdata.h" +#include "clustercommand.h" class ClusterSplitCommand : public Command { @@ -47,12 +53,208 @@ private: ofstream outList, outRabund, outSabund; void printData(ListVector*); - int createProcesses(vector < vector < map > >); + vector createProcesses(vector< map >, set&); vector cluster(vector< map >, set&); int mergeLists(vector, map, ListVector*); map completeListFile(vector, string, set&, ListVector*&); int createMergedDistanceFile(vector< map >); }; +/////////////////not working for Windows//////////////////////////////////////////////////////////// +// getting an access violation error. This is most likely caused by the +// threads stepping on eachother's structures, as I can run the thread function and the cluster fuction +// in separately without errors occuring. I suspect it may be in the use of the +// static class mothurOut, but I can't pinpoint the problem. All other objects are made new +// within the thread. 
MothurOut is used by almost all the classes in mothur, so if this was +// really the cause I would expect to see all the windows threaded commands to have issues, but not +// all do. So far, shhh.flows and trim.flows have similiar problems. Other thoughts, could it have +// anything to do with mothur's use of copy constructors in many of our data structures. ie. listvector +// is copied by nameassignment and passed to read which passes to the thread? -westcott 2-8-12 +//////////////////////////////////////////////////////////////////////////////////////////////////// +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct clusterData { + set labels; + vector < map > distNames; + string method; + MothurOut* m; + double cutoff, precision; + string tag, outputDir; + vector listFiles; + bool hard; + int length, threadID; + + + clusterData(){} + clusterData(vector < map > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th) { + distNames = dv; + m = mout; + cutoff = cu; + method = me; + outputDir = ou; + hard = hd; + precision = pre; + length = len; + threadID = th; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ + clusterData* pDataArray; + pDataArray = (clusterData*)lpParam; + + try { + cout << "starting " << endl; + + double smallestCutoff = pDataArray->cutoff; + + //cluster each distance file + for (int i = 0; i < pDataArray->distNames.size(); i++) { + + Cluster* mycluster = NULL; + SparseMatrix* mymatrix = NULL; + ListVector* mylist = NULL; + ListVector myoldList; + RAbundVector* myrabund = NULL; + + if 
(pDataArray->m->control_pressed) { break; } + + string thisNamefile = pDataArray->distNames[i].begin()->second; + string thisDistFile = pDataArray->distNames[i].begin()->first; + cout << thisNamefile << '\t' << thisDistFile << endl; + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine(); + + ReadMatrix* myread = new ReadColumnMatrix(thisDistFile); + myread->setCutoff(pDataArray->cutoff); + NameAssignment* mynameMap = new NameAssignment(thisNamefile); + mynameMap->readMap(); + cout << "done reading " << thisNamefile << endl; + myread->read(mynameMap); + cout << "done reading " << thisDistFile << endl; + if (pDataArray->m->control_pressed) { delete myread; delete mynameMap; break; } + + mylist = myread->getListVector(); + myoldList = *mylist; + mymatrix = myread->getMatrix(); + cout << "here" << endl; + delete myread; myread = NULL; + delete mynameMap; mynameMap = NULL; + + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine(); + + myrabund = new RAbundVector(mylist->getRAbundVector()); + cout << "here" << endl; + //create cluster + if (pDataArray->method == "furthest") { mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + else if(pDataArray->method == "nearest"){ mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + else if(pDataArray->method == "average"){ mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + pDataArray->tag = mycluster->getTag(); + cout << "here" << endl; + if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); } + string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile)); + cout << "here" << endl; + ofstream listFile; + pDataArray->m->openOutputFile(fileroot+ 
pDataArray->tag + ".list", listFile); + cout << "here" << endl; + pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list"); + + float previousDist = 0.00000; + float rndPreviousDist = 0.00000; + + myoldList = *mylist; + + bool print_start = true; + int start = time(NULL); + double saveCutoff = pDataArray->cutoff; + + while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){ + + if (pDataArray->m->control_pressed) { //clean up + delete mymatrix; delete mylist; delete mycluster; delete myrabund; + listFile.close(); + for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); } + pDataArray->listFiles.clear(); break; + } + + mycluster->update(saveCutoff); + + float dist = mymatrix->getSmallDist(); + float rndDist; + if (pDataArray->hard) { + rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision); + }else{ + rndDist = pDataArray->m->roundDist(dist, pDataArray->precision); + } + + if(previousDist <= 0.0000 && dist != previousDist){ + myoldList.setLabel("unique"); + myoldList.print(listFile); + if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); } + } + else if(rndDist != rndPreviousDist){ + myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1)); + myoldList.print(listFile); + if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); } + } + + previousDist = dist; + rndPreviousDist = rndDist; + myoldList = *mylist; + } + + cout << "here2" << endl; + if(previousDist <= 0.0000){ + myoldList.setLabel("unique"); + myoldList.print(listFile); + if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); } + } + else if(rndPreviousDistcutoff){ + myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1)); + myoldList.print(listFile); + if (pDataArray->labels.count(toString(rndPreviousDist, 
pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); } + } + + delete mymatrix; delete mylist; delete mycluster; delete myrabund; + mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL; + listFile.close(); + + if (pDataArray->m->control_pressed) { //clean up + for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); } + pDataArray->listFiles.clear(); break; + } + cout << "here3" << endl; + pDataArray->m->mothurRemove(thisDistFile); + pDataArray->m->mothurRemove(thisNamefile); + cout << "here4" << endl; + if (saveCutoff != pDataArray->cutoff) { + if (pDataArray->hard) { saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision); } + else { saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision); } + + pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine(); + } + cout << "here5" << endl; + if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; } + } + + pDataArray->cutoff = smallestCutoff; + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction"); + exit(1); + } +} +#endif + + + + #endif diff --git a/commandfactory.cpp b/commandfactory.cpp index 75fad85..303bf08 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -128,6 +128,7 @@ #include "summaryqualcommand.h" #include "otuassociationcommand.h" #include "sortseqscommand.h" +#include "classifytreecommand.h" /*******************************************************/ @@ -277,6 +278,7 @@ CommandFactory::CommandFactory(){ commands["shhh.seqs"] = "shhh.seqs"; commands["otu.association"] = "otu.association"; commands["sort.seqs"] = "sort.seqs"; + commands["classify.tree"] = "classify.tree"; commands["quit"] = "MPIEnabled"; } @@ -439,6 +441,7 @@ Command* CommandFactory::getCommand(string 
commandName, string optionString){ else if(commandName == "shhh.seqs") { command = new ShhhSeqsCommand(optionString); } else if(commandName == "otu.association") { command = new OTUAssociationCommand(optionString); } else if(commandName == "sort.seqs") { command = new SortSeqsCommand(optionString); } + else if(commandName == "classify.tree") { command = new ClassifyTreeCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -585,6 +588,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "shhh.seqs") { pipecommand = new ShhhSeqsCommand(optionString); } else if(commandName == "otu.association") { pipecommand = new OTUAssociationCommand(optionString); } else if(commandName == "sort.seqs") { pipecommand = new SortSeqsCommand(optionString); } + else if(commandName == "classify.tree") { pipecommand = new ClassifyTreeCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -719,6 +723,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "shhh.seqs") { shellcommand = new ShhhSeqsCommand(); } else if(commandName == "otu.association") { shellcommand = new OTUAssociationCommand(); } else if(commandName == "sort.seqs") { shellcommand = new SortSeqsCommand(); } + else if(commandName == "classify.tree") { shellcommand = new ClassifyTreeCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; @@ -744,7 +749,7 @@ Command* CommandFactory::getCommand(){ exit(1); } } -/***********************************************************************/ +***********************************************************************/ bool CommandFactory::isValidCommand(string command) { try { diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 9cdd033..e83c603 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -174,7 +174,8 @@ int CountSeqsCommand::execute(){ //open input file ifstream in; 
m->openInputFile(namefile, in); - + + int total = 0; while (!in.eof()) { if (m->control_pressed) { break; } @@ -217,7 +218,7 @@ int CountSeqsCommand::execute(){ out << firstCol << '\t' << names.size() << endl; } - + total += names.size(); } in.close(); @@ -225,6 +226,8 @@ int CountSeqsCommand::execute(){ if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + m->mothurOutEndLine(); + m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("Output File Name: "); m->mothurOutEndLine(); m->mothurOut(outputFileName); m->mothurOutEndLine(); diff --git a/database.hpp b/database.hpp index b2817a7..49f3903 100644 --- a/database.hpp +++ b/database.hpp @@ -45,7 +45,6 @@ class Database { public: Database(); - Database(const Database& db) : numSeqs(db.numSeqs), longest(db.longest), searchScore(db.searchScore), results(db.results), Scores(db.Scores) { m = MothurOut::getInstance(); } virtual ~Database(); virtual void generateDB() = 0; virtual void addSequence(Sequence) = 0; //add sequence to search engine diff --git a/decalc.cpp b/decalc.cpp index 973340c..98545f8 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -599,7 +599,7 @@ cout << largest->second << '\t' << largest->first->score << '\t' << largest->fir } } -//*************************************************************************************************************** +*************************************************************************************************************** //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used... 
int DeCalculator::findLargestContrib(vector seen) { try{ @@ -624,7 +624,7 @@ int DeCalculator::findLargestContrib(vector seen) { exit(1); } } -//*************************************************************************************************************** +*************************************************************************************************************** void DeCalculator::removeContrib(int bad, vector& quan) { try{ diff --git a/distancecommand.cpp b/distancecommand.cpp index 65edcf7..0ac1787 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -1094,7 +1094,7 @@ int DistanceCommand::convertMatrix(string outputFile) { exit(1); } } -/************************************************************************************************** +************************************************************************************************** int DistanceCommand::convertToLowerTriangle(string outputFile) { try{ @@ -1188,7 +1188,7 @@ int DistanceCommand::convertToLowerTriangle(string outputFile) { exit(1); } } -/**************************************************************************************************/ +**************************************************************************************************/ //its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff, //but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it. //also check to make sure the 2 files have the same alignment length. 
diff --git a/distancedb.cpp b/distancedb.cpp index 2cbf5ca..27e2785 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -14,10 +14,6 @@ #include "onegapignore.h" -/**************************************************************************************************/ -DistanceDB::DistanceDB(const DistanceDB& ddb) : data(ddb.data), templateSeqsLength(ddb.templateSeqsLength), templateAligned(ddb.templateAligned), Database(ddb) { - distCalculator = new oneGapIgnoreTermGapDist(); -} /**************************************************************************************************/ DistanceDB::DistanceDB() : Database() { try { diff --git a/distancedb.hpp b/distancedb.hpp index d7e05db..2624d6d 100644 --- a/distancedb.hpp +++ b/distancedb.hpp @@ -19,7 +19,6 @@ class DistanceDB : public Database { public: DistanceDB(); - DistanceDB(const DistanceDB& ddb); ~DistanceDB() { delete distCalculator; } void generateDB() {} //doesn't generate a search db diff --git a/eachgapdist.h b/eachgapdist.h index fe9bc30..1aca10c 100644 --- a/eachgapdist.h +++ b/eachgapdist.h @@ -19,7 +19,6 @@ class eachGapDist : public Dist { public: eachGapDist() {} - eachGapDist(const eachGapDist& ddb) {} void calcDist(Sequence A, Sequence B){ int diff = 0; diff --git a/hcluster.cpp b/hcluster.cpp index f051465..6cd4531 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -752,7 +752,7 @@ seqDist HCluster::getNextDist(char* buffer, int& index, int size){ exit(1); } } -/***********************************************************************/ +***********************************************************************/ int HCluster::processFile() { try { string firstName, secondName; diff --git a/ignoregaps.h b/ignoregaps.h index ccc6b96..894f92d 100644 --- a/ignoregaps.h +++ b/ignoregaps.h @@ -21,7 +21,6 @@ class ignoreGaps : public Dist { public: ignoreGaps() {} - ignoreGaps(const ignoreGaps& ddb) {} void calcDist(Sequence A, Sequence B){ int diff = 0; diff --git a/kmerdb.hpp b/kmerdb.hpp index 4ae00b9..ead3d7e 
100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -26,7 +26,6 @@ class KmerDB : public Database { public: KmerDB(string, int); - KmerDB(const KmerDB& kdb) : kmerSize(kdb.kmerSize), maxKmer(kdb.maxKmer), count(kdb.count), kmerDBName(kdb.kmerDBName), kmerLocations(kdb.kmerLocations), Database(kdb) {} KmerDB(); ~KmerDB(); diff --git a/maligner.cpp b/maligner.cpp index f31e482..3d1d1c9 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -612,7 +612,7 @@ vector Maligner::extractHighestPath(vector > } } -//*************************************************************************************************************** +*************************************************************************************************************** vector Maligner::mapTraceRegionsToAlignment(vector path, vector seqs) { try { diff --git a/mothurmetastats.cpp b/mothurmetastats.cpp index ae82225..ae3ab80 100644 --- a/mothurmetastats.cpp +++ b/mothurmetastats.cpp @@ -217,7 +217,7 @@ int MothurMetastats::runMetastats(string outputFileName, vector< vector if(qvalues[i] < threshold){ m->mothurOut("Feature " + toString((i+1)) + " is significant, q = "); cout << qvalues[i]; - m->mothurOutJustToLog(toString(pvalues[i])); m->mothurOutEndLine(); + m->mothurOutJustToLog(toString(qvalues[i])); m->mothurOutEndLine(); } } @@ -492,6 +492,12 @@ int MothurMetastats::calc_twosample_ts(vector& Pmatrix, int secondGroupi vector MothurMetastats::calc_qvalues(vector& pValues) { try { + /* cout << "x <- c(" << pValues[0]; + for (int l = 1; l < pValues.size(); l++){ + cout << ", " << pValues[l]; + } + cout << ")\n";*/ + int numRows = pValues.size(); vector qvalues(numRows, 0.0); diff --git a/mothurout.cpp b/mothurout.cpp index 20a7b52..25e534e 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -212,8 +212,8 @@ void MothurOut::mothurOut(string output) { if (pid == 0) { //only one process should output to screen #endif - cout << output; out << output; + logger() << output; #ifdef USE_MPI } @@ -234,8 +234,8 @@ void 
MothurOut::mothurOutEndLine() { if (pid == 0) { //only one process should output to screen #endif - cout << endl; out << endl; + logger() << endl; #ifdef USE_MPI } @@ -257,13 +257,15 @@ void MothurOut::mothurOut(string output, ofstream& outputFile) { if (pid == 0) { //only one process should output to screen #endif - cout << output; + out << output; outputFile << output; + logger() << output; #ifdef USE_MPI } #endif + } catch(exception& e) { errorOut(e, "MothurOut", "MothurOut"); @@ -280,9 +282,9 @@ void MothurOut::mothurOutEndLine(ofstream& outputFile) { if (pid == 0) { //only one process should output to screen #endif - cout << endl; out << endl; outputFile << endl; + logger() << endl; #ifdef USE_MPI } @@ -726,7 +728,7 @@ string MothurOut::getFullPathName(string fileName){ }else if (path[(pos-1)] == '/') { //you want the current working dir ./ path = path.substr(0, pos); }else if (pos == 1) { break; //you are at the end - }else { cout << "cannot resolve path for " << fileName << endl; return fileName; } + }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; } } for (int i = index; i >= 0; i--) { @@ -772,7 +774,7 @@ string MothurOut::getFullPathName(string fileName){ }else if (path[(pos-1)] == '\\') { //you want the current working dir ./ path = path.substr(0, pos); }else if (pos == 1) { break; //you are at the end - }else { cout << "cannot resolve path for " << fileName << endl; return fileName; } + }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; } } for (int i = index; i >= 0; i--) { @@ -1151,7 +1153,7 @@ vector MothurOut::setFilePosEachLine(string filename, int& n while(isspace(d) && (d != in.eof())) { d=in.get(); count++;} } positions.push_back(count-1); - cout << count-1 << endl; + //cout << count-1 << endl; } in.close(); diff --git a/mothurout.h b/mothurout.h index d2a36b3..12de987 100644 --- a/mothurout.h +++ b/mothurout.h @@ -12,6 +12,22 @@ #include "mothur.h" 
+/***********************************************/ +struct logger { + + logger() {} + ~logger() {} + + template< class T > + logger& operator <<( const T& o ) { + cout << o; return *this; + } + + logger& operator<<(ostream& (*m)(ostream&) ) { + cout << m; return *this; + } + +}; /***********************************************/ class MothurOut { diff --git a/myPerseus.cpp b/myPerseus.cpp index 3cf38cd..f7cb65e 100644 --- a/myPerseus.cpp +++ b/myPerseus.cpp @@ -515,7 +515,7 @@ int Perseus::getChimera(vector sequences, for(int i=0;i sequences[bestLeft[l]].frequency)){ + if(((leftDiffs[i][l] < singleLeft[l]) && sequences[i].frequency) || ((leftDiffs[i][l] == singleLeft[l]) && (sequences[i].frequency > sequences[bestLeft[l]].frequency))){ singleLeft[l] = leftDiffs[i][l]; bestLeft[l] = i; } @@ -533,7 +533,7 @@ int Perseus::getChimera(vector sequences, for(int i=0;i sequences[bestRight[l]].frequency)){ + if((rightDiffs[i][l] < singleRight[l] && sequences[i].frequency) || ((rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency))){ singleRight[l] = rightDiffs[i][l]; bestRight[l] = i; } @@ -649,7 +649,7 @@ int Perseus::getTrimera(vector& sequences, if(restricted[i] == 0){ int delta = leftDiffs[i][y] - leftDiffs[i][x]; - if(delta < minDelta[x][y] || delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency){ + if(delta < minDelta[x][y] || (delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency)){ minDelta[x][y] = delta; minDeltaSeq[x][y] = i; } diff --git a/onegapdist.h b/onegapdist.h index 971e6b0..3e5b6c7 100644 --- a/onegapdist.h +++ b/onegapdist.h @@ -19,7 +19,6 @@ class oneGapDist : public Dist { public: oneGapDist() {} - oneGapDist(const oneGapDist& ddb) {} void calcDist(Sequence A, Sequence B){ diff --git a/onegapignore.h b/onegapignore.h index dba3f38..fdbc196 100644 --- a/onegapignore.h +++ b/onegapignore.h @@ -19,7 +19,6 @@ class oneGapIgnoreTermGapDist : 
public Dist { public: oneGapIgnoreTermGapDist() {} - oneGapIgnoreTermGapDist(const oneGapIgnoreTermGapDist& ddb) {} void calcDist(Sequence A, Sequence B){ diff --git a/phylodiversity.cpp b/phylodiversity.cpp index 86dc539..d5ccaf9 100644 --- a/phylodiversity.cpp +++ b/phylodiversity.cpp @@ -20,7 +20,7 @@ EstOutput PhyloDiversity::getValues(Tree* t, vector treeNodes, vector< vect //initialize Dscore for (int i=0; iGroups.size(); i++) { DScore[globaldata->Groups[i]] = 0.0; } - /******************************************************** + ******************************************************** //calculate a D value for each group for(int v=0;v treeNodes, vector< vect exit(1); } } -/**************************************************************************************************/ +**************************************************************************************************/ diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 582da49..8e4f327 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -514,7 +514,7 @@ int PreClusterCommand::readFASTA(){ m->openInputFile(fastafile, inFasta); //string firstCol, secondCol, nameString; - length = 0; + set lengths; while (!inFasta.eof()) { @@ -540,17 +540,20 @@ int PreClusterCommand::readFASTA(){ else{ seqPNode tempNode(itSize->second, seq, names[seq.getName()]); alignSeqs.push_back(tempNode); - if (seq.getAligned().length() > length) { length = seq.getAligned().length(); } + lengths.insert(seq.getAligned().length()); } }else { //no names file, you are identical to yourself seqPNode tempNode(1, seq, seq.getName()); alignSeqs.push_back(tempNode); - if (seq.getAligned().length() > length) { length = seq.getAligned().length(); } + lengths.insert(seq.getAligned().length()); } } } inFasta.close(); //inNames.close(); + + if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. 
pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } + return alignSeqs.size(); } @@ -562,7 +565,7 @@ int PreClusterCommand::readFASTA(){ /**************************************************************************************************/ int PreClusterCommand::loadSeqs(map& thisName, vector& thisSeqs){ try { - length = 0; + set lengths; alignSeqs.clear(); map::iterator it; bool error = false; @@ -585,15 +588,17 @@ int PreClusterCommand::loadSeqs(map& thisName, vector& seqPNode tempNode(numReps, thisSeqs[i], it->second); alignSeqs.push_back(tempNode); - if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } + lengths.insert(thisSeqs[i].getAligned().length()); } }else { //no names file, you are identical to yourself seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName()); alignSeqs.push_back(tempNode); - if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); } + lengths.insert(thisSeqs[i].getAligned().length()); } } + if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. 
pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); } + //sanity check if (error) { m->control_pressed = true; } diff --git a/referencedb.cpp b/referencedb.cpp index 36400c1..d91490e 100644 --- a/referencedb.cpp +++ b/referencedb.cpp @@ -27,5 +27,5 @@ void ReferenceDB::clearMemory() { } /******************************************************* ReferenceDB::~ReferenceDB() { myInstance = NULL; } -/*******************************************************/ +*******************************************************/ diff --git a/seqnoise.cpp b/seqnoise.cpp index 9ee8010..8e7a439 100644 --- a/seqnoise.cpp +++ b/seqnoise.cpp @@ -1016,4 +1016,4 @@ int main(int argc, char *argv[]){ return 0; } -/**************************************************************************************************/ +**************************************************************************************************/ diff --git a/sequence.hpp b/sequence.hpp index 6a50cb0..af0cba8 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -28,9 +28,6 @@ public: Sequence(string, string); Sequence(ifstream&); Sequence(istringstream&); - Sequence(const Sequence& se) : name(se.name), unaligned(se.unaligned), aligned(se.aligned), pairwise(se.pairwise), numBases(se.numBases), startPos(se.startPos), endPos(se.endPos), - alignmentLength(se.alignmentLength), isAligned(se.isAligned), longHomoPolymer(se.longHomoPolymer), ambigBases(se.ambigBases) { m = MothurOut::getInstance(); } - //these constructors just set the unaligned string to save space Sequence(string, string, string); Sequence(ifstream&, string); diff --git a/sharedrabundfloatvector.cpp b/sharedrabundfloatvector.cpp index bea2f8c..8abf920 100644 --- a/sharedrabundfloatvector.cpp +++ b/sharedrabundfloatvector.cpp @@ -348,7 +348,7 @@ SharedRAbundVector SharedRAbundFloatVector::getSharedRAbundVector(){ exit(1); } } -/***********************************************************************/ 
+***********************************************************************/ vector SharedRAbundFloatVector::getSharedRAbundFloatVectors(){ try { SharedUtil* util; @@ -419,7 +419,7 @@ SharedSAbundVector SharedRAbundVector::getSharedSAbundVector(){ exit(1); } } -/***********************************************************************/ +***********************************************************************/ SAbundVector SharedRAbundFloatVector::getSAbundVector() { try { @@ -461,7 +461,7 @@ SharedOrderVector SharedRAbundFloatVector::getSharedOrderVector() { exit(1); } } -/***********************************************************************/ +***********************************************************************/ //this is not functional, not sure how to handle it yet, but I need the stub because it is a pure function OrderVector SharedRAbundFloatVector::getOrderVector(map* nameMap = NULL) { try { diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp index 5ad4b2d..70b0960 100644 --- a/sharedrabundvector.cpp +++ b/sharedrabundvector.cpp @@ -55,7 +55,7 @@ SharedRAbundVector::SharedRAbundVector(string id, vector rav) : Data } -/***********************************************************************/ +***********************************************************************/ //reads a shared file SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), numBins(0), numSeqs(0) { try { diff --git a/shhhercommand.cpp b/shhhercommand.cpp index 5378211..5d8263c 100644 --- a/shhhercommand.cpp +++ b/shhhercommand.cpp @@ -9,15 +9,6 @@ #include "shhhercommand.h" -#include "readcolumn.h" -#include "readmatrix.hpp" -#include "rabundvector.hpp" -#include "sabundvector.hpp" -#include "listvector.hpp" -#include "cluster.hpp" -#include "sparsematrix.hpp" -#include - //********************************************************************************************************************** vector ShhherCommand::setParameters(){ try { @@ -76,18 +67,8 @@ 
ShhherCommand::ShhherCommand(){ ShhherCommand::ShhherCommand(string option) { try { - -#ifdef USE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &ncpus); - - if(pid == 0){ -#endif - - abort = false; calledHelp = false; - //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } else if(option == "citation") { citation(); abort = true; calledHelp = true;} @@ -168,6 +149,67 @@ ShhherCommand::ShhherCommand(string option) { m->openOutputFile(compositeNamesFileName, temp); temp.close(); } + + if(flowFilesFileName != "not found"){ + string fName; + + ifstream flowFilesFile; + m->openInputFile(flowFilesFileName, flowFilesFile); + while(flowFilesFile){ + fName = m->getline(flowFilesFile); + + //test if file is valid + ifstream in; + int ableToOpen = m->openInputFile(fName, in, "noerror"); + in.close(); + if (ableToOpen == 1) { + if (inputDir != "") { //default path is set + string tryPath = inputDir + fName; + m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fName = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(fName); + m->mothurOut("Unable to open " + fName + ". 
Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fName = tryPath; + } + } + + //if you can't open it its not in current working directory or inputDir, try mothur excutable location + if (ableToOpen == 1) { + string exepath = m->argv; + string tempPath = exepath; + for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); } + exepath = exepath.substr(0, (tempPath.find_last_of('m'))); + + string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName); + m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fName = tryPath; + } + + if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); } + else { flowFileVector.push_back(fName); } + m->gobble(flowFilesFile); + } + flowFilesFile.close(); + if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; } + } + else{ + flowFileVector.push_back(flowFileName); + } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -272,11 +314,6 @@ ShhherCommand::ShhherCommand(string option) { } } - -#ifdef USE_MPI - } -#endif - } catch(exception& e) { m->errorOut(e, "ShhherCommand", "ShhherCommand"); @@ -291,7 +328,9 @@ int ShhherCommand::execute(){ int tag = 1976; MPI_Status status; - + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &ncpus); + if(pid == 0){ for(int i=1;icontrol_pressed) { return 0; } getJointLookUp(); if (m->control_pressed) { return 0; } - vector flowFileVector; - if(flowFilesFileName != "not found"){ - string fName; - - ifstream 
flowFilesFile; - m->openInputFile(flowFilesFileName, flowFilesFile); - while(flowFilesFile){ - fName = m->getline(flowFilesFile); - flowFileVector.push_back(fName); - m->gobble(flowFilesFile); - } - } - else{ - flowFileVector.push_back(flowFileName); - } int numFiles = flowFileVector.size(); for(int i=1;ierrorOut(e, "ShhherCommand", "flowDistParentFork"); + m->errorOut(e, "ShhherCommand", "flowDistMPI"); exit(1); } } +/**************************************************************************************************/ + +void ShhherCommand::getOTUData(string listFileName){ + try { + + ifstream listFile; + m->openInputFile(listFileName, listFile); + string label; + + listFile >> label >> numOTUs; + + otuData.assign(numSeqs, 0); + cumNumSeqs.assign(numOTUs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + aaP.clear();aaP.resize(numOTUs); + + seqNumber.clear(); + aaI.clear(); + seqIndex.clear(); + + string singleOTU = ""; + + for(int i=0;icontrol_pressed) { break; } + + listFile >> singleOTU; + + istringstream otuString(singleOTU); + + while(otuString){ + + string seqName = ""; + + for(int j=0;j::iterator nmIt = nameMap.find(seqName); + int index = nmIt->second; + + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + seqName = ""; + } + } + + map::iterator nmIt = nameMap.find(seqName); + + int index = nmIt->second; + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + + otuString.get(); + } + + sort(aaP[i].begin(), aaP[i].end()); + for(int j=0;jerrorOut(e, "ShhherCommand", "getOTUData"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::initPyroCluster(){ + try{ + if (numOTUs < processors) { processors = 1; } + + dist.assign(numSeqs * numOTUs, 0); + change.assign(numOTUs, 1); + centroids.assign(numOTUs, -1); + weight.assign(numOTUs, 0); + singleTau.assign(numSeqs, 1.0); + + 
nSeqsBreaks.assign(processors+1, 0); + nOTUsBreaks.assign(processors+1, 0); + + nSeqsBreaks[0] = 0; + for(int i=0;ierrorOut(e, "ShhherCommand", "initPyroCluster"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::fill(){ + try { + int index = 0; + for(int i=0;icontrol_pressed) { break; } + + cumNumSeqs[i] = index; + for(int j=0;jerrorOut(e, "ShhherCommand", "fill"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::getFlowData(){ + try{ + ifstream flowFile; + m->openInputFile(flowFileName, flowFile); + + string seqName; + seqNameVector.clear(); + lengths.clear(); + flowDataIntI.clear(); + nameMap.clear(); + + + int currentNumFlowCells; + + float intensity; + + flowFile >> numFlowCells; + int index = 0;//pcluster + while(!flowFile.eof()){ + + if (m->control_pressed) { break; } + + flowFile >> seqName >> currentNumFlowCells; + lengths.push_back(currentNumFlowCells); + + seqNameVector.push_back(seqName); + nameMap[seqName] = index++;//pcluster + + for(int i=0;i> intensity; + if(intensity > 9.99) { intensity = 9.99; } + int intI = int(100 * intensity + 0.0001); + flowDataIntI.push_back(intI); + } + m->gobble(flowFile); + } + flowFile.close(); + + numSeqs = seqNameVector.size(); + + for(int i=0;icontrol_pressed) { break; } + + int iNumFlowCells = i * numFlowCells; + for(int j=lengths[i];jerrorOut(e, "ShhherCommand", "getFlowData"); + exit(1); + } +} +/**************************************************************************************************/ +void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector& otuIndex){ + + try{ + vector newTau(numOTUs,0); + vector norms(numSeqs, 0); + otuIndex.clear(); + seqIndex.clear(); + singleTau.clear(); + + for(int i=startSeq;icontrol_pressed) { break; } + + double offset = 1e8; + int indexOffset = i * numOTUs; + + for(int 
j=0;j MIN_WEIGHT && change[j] == 1){ + dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); + } + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ + offset = dist[indexOffset + j]; + } + } + + for(int j=0;j MIN_WEIGHT){ + newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; + norms[i] += newTau[j]; + } + else{ + newTau[j] = 0.0; + } + } + + for(int j=0;j MIN_TAU){ + otuIndex.push_back(j); + seqIndex.push_back(i); + singleTau.push_back(newTau[j]); + } + } + + } + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ + + try{ + + int total = 0; + vector newTau(numOTUs,0); + vector norms(numSeqs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=startSeq;icontrol_pressed) { break; } + + int indexOffset = i * numOTUs; + + double offset = 1e8; + + for(int j=0;j MIN_WEIGHT && change[j] == 1){ + dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); + } + + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ + offset = dist[indexOffset + j]; + } + } + + for(int j=0;j MIN_WEIGHT){ + newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; + norms[i] += newTau[j]; + } + else{ + newTau[j] = 0.0; + } + } + + for(int j=0;j MIN_TAU){ + + int oldTotal = total; + + total++; + + singleTau.resize(total, 0); + seqNumber.resize(total, 0); + seqIndex.resize(total, 0); + + singleTau[oldTotal] = newTau[j]; + + aaP[j][nSeqsPerOTU[j]] = oldTotal; + aaI[j][nSeqsPerOTU[j]] = i; + nSeqsPerOTU[j]++; + } + } + + } + + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "calcNewDistancesParent"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::setOTUs(){ + + try { + 
vector bigTauMatrix(numOTUs * numSeqs, 0.0000); + + for(int i=0;icontrol_pressed) { break; } + + for(int j=0;j maxTau){ + maxTau = bigTauMatrix[i * numOTUs + j]; + maxOTU = j; + } + } + + otuData[i] = maxOTU; + } + + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=0;ierrorOut(e, "ShhherCommand", "setOTUs"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::getUniques(){ + try{ + + + numUniques = 0; + uniqueFlowgrams.assign(numFlowCells * numSeqs, -1); + uniqueCount.assign(numSeqs, 0); // anWeights + uniqueLengths.assign(numSeqs, 0); + mapSeqToUnique.assign(numSeqs, -1); + mapUniqueToSeq.assign(numSeqs, -1); + + vector uniqueFlowDataIntI(numFlowCells * numSeqs, -1); + + for(int i=0;icontrol_pressed) { break; } + + int index = 0; + + vector current(numFlowCells); + for(int j=0;j uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; } + break; + } + index++; + } + + if(index == numUniques){ + uniqueLengths[numUniques] = lengths[i]; + uniqueCount[numUniques] = 1; + mapSeqToUnique[i] = numUniques;//anMap + mapUniqueToSeq[numUniques] = i;//anF + + for(int k=0;kcontrol_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); } + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "getUniques"); + exit(1); + } +} + +/**************************************************************************************************/ + +float ShhherCommand::calcPairwiseDist(int seqA, int seqB){ + try{ + int minLength = lengths[mapSeqToUnique[seqA]]; + if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; } + + int ANumFlowCells = seqA * numFlowCells; + int BNumFlowCells = seqB * numFlowCells; + + float dist = 0; + + for(int i=0;icontrol_pressed) { break; } + + int flowAIntI = flowDataIntI[ANumFlowCells + i]; + float flowAPrI = flowDataPrI[ANumFlowCells + i]; + + int flowBIntI = flowDataIntI[BNumFlowCells + i]; + float flowBPrI = flowDataPrI[BNumFlowCells + i]; 
+ dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; + } + + dist /= (float) minLength; + return dist; + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "calcPairwiseDist"); + exit(1); + } +} + +//**********************************************************************************************************************/ + +string ShhherCommand::cluster(string distFileName, string namesFileName){ + try { + + ReadMatrix* read = new ReadColumnMatrix(distFileName); + read->setCutoff(cutoff); + + NameAssignment* clusterNameMap = new NameAssignment(namesFileName); + clusterNameMap->readMap(); + read->read(clusterNameMap); + + ListVector* list = read->getListVector(); + SparseMatrix* matrix = read->getMatrix(); + + delete read; + delete clusterNameMap; + + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); + + Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); + string tag = cluster->getTag(); + + double clusterCutoff = cutoff; + while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){ + + if (m->control_pressed) { break; } + + cluster->update(clusterCutoff); + } + + list->setLabel(toString(cutoff)); + + string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; + ofstream listFile; + m->openOutputFile(listFileName, listFile); + list->print(listFile); + listFile.close(); + + delete matrix; delete cluster; delete rabund; delete list; + + return listFileName; + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "cluster"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::calcCentroidsDriver(int start, int finish){ + + //this function gets the most likely homopolymer length at a flow position for a group of sequences + //within an otu + + try{ + + for(int i=start;icontrol_pressed) { break; } + + double count = 0; + int position = 0; + int 
minFlowGram = 100000000; + double minFlowValue = 1e8; + change[i] = 0; //FALSE + + for(int j=0;j 0 && count > MIN_COUNT){ + vector adF(nSeqsPerOTU[i]); + vector anL(nSeqsPerOTU[i]); + + for(int j=0;jerrorOut(e, "ShhherCommand", "calcCentroidsDriver"); + exit(1); + } +} + +/**************************************************************************************************/ + +double ShhherCommand::getDistToCentroid(int cent, int flow, int length){ + try{ + + int flowAValue = cent * numFlowCells; + int flowBValue = flow * numFlowCells; + + double dist = 0; + + for(int i=0;ierrorOut(e, "ShhherCommand", "getDistToCentroid"); + exit(1); + } +} + +/**************************************************************************************************/ + +double ShhherCommand::getNewWeights(){ + try{ + + double maxChange = 0; + + for(int i=0;icontrol_pressed) { break; } + + double difference = weight[i]; + weight[i] = 0; + + for(int j=0;j maxChange){ maxChange = difference; } + } + return maxChange; + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "getNewWeights"); + exit(1); + } +} + + /**************************************************************************************************/ + +double ShhherCommand::getLikelihood(){ + + try{ + + vector P(numSeqs, 0); + int effNumOTUs = 0; + + for(int i=0;i MIN_WEIGHT){ + effNumOTUs++; + } + } + + string hold; + for(int i=0;icontrol_pressed) { break; } + + for(int j=0;jerrorOut(e, "ShhherCommand", "getNewWeights"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::checkCentroids(){ + try{ + vector unique(numOTUs, 1); + + for(int i=0;icontrol_pressed) { break; } + + if(unique[i] == 1){ + for(int j=i+1;jerrorOut(e, "ShhherCommand", "checkCentroids"); + exit(1); + } +} + /**************************************************************************************************/ + + + +void ShhherCommand::writeQualities(vector otuCounts){ + 
+ try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } + string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual"; + + ofstream qualityFile; + m->openOutputFile(qualityFileName, qualityFile); + + qualityFile.setf(ios::fixed, ios::floatfield); + qualityFile.setf(ios::showpoint); + qualityFile << setprecision(6); + + vector > qualities(numOTUs); + vector pr(HOMOPS, 0); + + + for(int i=0;icontrol_pressed) { break; } + + int index = 0; + int base = 0; + + if(nSeqsPerOTU[i] > 0){ + qualities[i].assign(1024, -1); + + while(index < numFlowCells){ + double maxPrValue = 1e8; + short maxPrIndex = -1; + double count = 0.0000; + + pr.assign(HOMOPS, 0); + + for(int j=0;j MIN_COUNT){ + double U = 0.0000; + double norm = 0.0000; + + for(int s=0;s0.00){ + temp = log10(U); + } + else{ + temp = -10.1; + } + temp = floor(-10 * temp); + value = (int)floor(temp); + if(value > 100){ value = 100; } + + qualities[i][base] = (int)value; + base++; + } + } + + index++; + } + } + + + if(otuCounts[i] > 0){ + qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; + + int j=4; //need to get past the first four bases + while(qualities[i][j] != -1){ + qualityFile << qualities[i][j] << ' '; + j++; + } + qualityFile << endl; + } + } + qualityFile.close(); + outputNames.push_back(qualityFileName); + + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "writeQualities"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::writeSequences(vector otuCounts){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } + string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta"; + ofstream fastaFile; + m->openOutputFile(fastaFileName, fastaFile); + + vector names(numOTUs, ""); + + for(int 
i=0;icontrol_pressed) { break; } + + int index = centroids[i]; + + if(otuCounts[i] > 0){ + fastaFile << '>' << seqNameVector[aaI[i][0]] << endl; + + string newSeq = ""; + + for(int j=0;jappendFiles(fastaFileName, compositeFASTAFileName); + } + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "writeSequences"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::writeNames(vector otuCounts){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } + string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names"; + ofstream nameFile; + m->openOutputFile(nameFileName, nameFile); + + for(int i=0;icontrol_pressed) { break; } + + if(otuCounts[i] > 0){ + nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]]; + + for(int j=1;jappendFiles(nameFileName, compositeNamesFileName); + } + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "writeNames"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::writeGroups(){ + try { + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } + string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)); + string groupFileName = fileRoot + "shhh.groups"; + ofstream groupFile; + m->openOutputFile(groupFileName, groupFile); + + for(int i=0;icontrol_pressed) { break; } + groupFile << seqNameVector[i] << '\t' << fileRoot << endl; + } + groupFile.close(); + outputNames.push_back(groupFileName); + + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "writeGroups"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::writeClusters(vector otuCounts){ + try { + 
string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } + string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts"; + ofstream otuCountsFile; + m->openOutputFile(otuCountsFileName, otuCountsFile); + + string bases = flowOrder; + + for(int i=0;icontrol_pressed) { + break; + } + //output the translated version of the centroid sequence for the otu + if(otuCounts[i] > 0){ + int index = centroids[i]; + + otuCountsFile << "ideal\t"; + for(int j=8;jerrorOut(e, "ShhherCommand", "writeClusters"); + exit(1); + } +} #else //********************************************************************************************************************** @@ -755,55 +1818,213 @@ int ShhherCommand::execute(){ getSingleLookUp(); if (m->control_pressed) { return 0; } getJointLookUp(); if (m->control_pressed) { return 0; } - - vector flowFileVector; - if(flowFilesFileName != "not found"){ - string fName; + int numFiles = flowFileVector.size(); + + if (numFiles < processors) { processors = numFiles; } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); } + else { createProcesses(flowFileVector); } //each processor processes one file +#else + driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); +#endif + + if(compositeFASTAFileName != ""){ + outputNames.push_back(compositeFASTAFileName); + outputNames.push_back(compositeNamesFileName); + } + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "execute"); + exit(1); + } +} +#endif 
+/**************************************************************************************************/ + +int ShhherCommand::createProcesses(vector filenames){ + try { + vector processIDS; + int process = 1; + int num = 0; + + //sanity check + if (filenames.size() < processors) { processors = filenames.size(); } + + //divide the groups between the processors + vector lines; + int numFilesPerProcessor = filenames.size() / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numFilesPerProcessor; + int endIndex = (i+1) * numFilesPerProcessor; + if(i == (processors - 1)){ endIndex = filenames.size(); } + lines.push_back(linePair(startIndex, endIndex)); + } + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); - ifstream flowFilesFile; - m->openInputFile(flowFilesFileName, flowFilesFile); - while(flowFilesFile){ - fName = m->getline(flowFilesFile); - flowFileVector.push_back(fName); - m->gobble(flowFilesFile); + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end); + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); } } - else{ - flowFileVector.push_back(flowFileName); + + //do my part + driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end); + + //force parent to wait until all the processes are done + for (int i=0;i pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. 
+ for( int i=0; ioutputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } - for(int i=0;iappendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName); + m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName); + m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp")); + m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp")); + } + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "createProcesses"); + exit(1); + } +} +/**************************************************************************************************/ + +int ShhherCommand::driver(vector filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){ + try { + + for(int i=start;icontrol_pressed) { break; } - flowFileName = flowFileVector[i]; - - m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n"); + string flowFileName = filenames[i]; + + m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n"); m->mothurOut("Reading flowgrams...\n"); - getFlowData(); + + vector seqNameVector; + vector lengths; + vector flowDataIntI; + vector flowDataPrI; + map nameMap; + vector uniqueFlowgrams; + vector uniqueCount; + vector mapSeqToUnique; + vector mapUniqueToSeq; + vector uniqueLengths; + int numFlowCells; + + int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells); if (m->control_pressed) { break; } m->mothurOut("Identifying unique flowgrams...\n"); - getUniques(); + int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI); if 
(m->control_pressed) { break; } m->mothurOut("Calculating distances between flowgrams...\n"); - string distFileName = createDistFile(processors); - string namesFileName = createNamesFile(); + string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; + unsigned long long begTime = time(NULL); + double begClock = clock(); + + flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); + + m->mothurOutEndLine(); + m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); + + + string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; + createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq); if (m->control_pressed) { break; } m->mothurOut("\nClustering flowgrams...\n"); - string listFileName = cluster(distFileName, namesFileName); + string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; + cluster(listFileName, distFileName, namesFileName); if (m->control_pressed) { break; } + + vector otuData; + vector cumNumSeqs; + vector nSeqsPerOTU; + vector > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices + vector > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI + vector seqNumber; //tMaster->anP: the sequence id number sorted by OTU + vector seqIndex; //tMaster->anI; the index that corresponds to seqNumber + - getOTUData(listFileName); + int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap); if (m->control_pressed) { break; } @@ -811,16 +2032,35 @@ int ShhherCommand::execute(){ m->mothurRemove(namesFileName); m->mothurRemove(listFileName); - initPyroCluster(); + vector dist; //adDist - distance of sequences to centroids + vector change; //did the 
centroid sequence change? 0 = no; 1 = yes + vector centroids; //the representative flowgram for each cluster m + vector weight; + vector singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs) + vector nSeqsBreaks; + vector nOTUsBreaks; + + dist.assign(numSeqs * numOTUs, 0); + change.assign(numOTUs, 1); + centroids.assign(numOTUs, -1); + weight.assign(numOTUs, 0); + singleTau.assign(numSeqs, 1.0); + + nSeqsBreaks.assign(2, 0); + nOTUsBreaks.assign(2, 0); + + nSeqsBreaks[0] = 0; + nSeqsBreaks[1] = numSeqs; + nOTUsBreaks[1] = numOTUs; if (m->control_pressed) { break; } double maxDelta = 0; int iter = 0; - double begClock = clock(); - unsigned long long begTime = time(NULL); - + begClock = clock(); + begTime = time(NULL); + m->mothurOut("\nDenoising flowgrams...\n"); m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); @@ -830,92 +2070,88 @@ int ShhherCommand::execute(){ double cycClock = clock(); unsigned long long cycTime = time(NULL); - fill(); + fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); if (m->control_pressed) { break; } - - calcCentroids(); + + calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); if (m->control_pressed) { break; } - - maxDelta = getNewWeights(); if (m->control_pressed) { break; } - double nLL = getLikelihood(); if (m->control_pressed) { break; } - checkCentroids(); + + maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight); + + if (m->control_pressed) { break; } + + double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); + + if (m->control_pressed) { break; } + + checkCentroids(numOTUs, centroids, weight); if (m->control_pressed) { break; } - calcNewDistances(); + calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, 
numFlowCells, lengths); if (m->control_pressed) { break; } iter++; m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); - + } if (m->control_pressed) { break; } m->mothurOut("\nFinalizing...\n"); - fill(); + fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); if (m->control_pressed) { break; } - setOTUs(); + setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI); if (m->control_pressed) { break; } vector otuCounts(numOTUs, 0); for(int i=0;icontrol_pressed) { break; } - writeQualities(otuCounts); if (m->control_pressed) { break; } - writeSequences(otuCounts); if (m->control_pressed) { break; } - writeNames(otuCounts); if (m->control_pressed) { break; } - writeClusters(otuCounts); if (m->control_pressed) { break; } - writeGroups(); if (m->control_pressed) { break; } + calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); + + if (m->control_pressed) { break; } + + writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; } + writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; } + writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; } + writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; } + writeGroups(flowFileName, numSeqs, seqNameVector); if 
(m->control_pressed) { break; } m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - - if(compositeFASTAFileName != ""){ - outputNames.push_back(compositeFASTAFileName); - outputNames.push_back(compositeNamesFileName); - } - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "execute"); - exit(1); - } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + return 0; + + }catch(exception& e) { + m->errorOut(e, "ShhherCommand", "driver"); + exit(1); + } } -#endif -/**************************************************************************************************/ -void ShhherCommand::getFlowData(){ +/**************************************************************************************************/ +int ShhherCommand::getFlowData(string filename, vector& thisSeqNameVector, vector& thisLengths, vector& thisFlowDataIntI, map& thisNameMap, int& numFlowCells){ try{ + ifstream flowFile; - m->openInputFile(flowFileName, flowFile); + + m->openInputFile(filename, flowFile); string seqName; - seqNameVector.clear(); - lengths.clear(); - flowDataIntI.clear(); - nameMap.clear(); - - int currentNumFlowCells; - float intensity; + thisSeqNameVector.clear(); + thisLengths.clear(); + thisFlowDataIntI.clear(); + thisNameMap.clear(); flowFile >> numFlowCells; int index = 0;//pcluster @@ -924,32 +2160,34 @@ void ShhherCommand::getFlowData(){ if (m->control_pressed) { break; } flowFile >> seqName >> currentNumFlowCells; - 
lengths.push_back(currentNumFlowCells); - - seqNameVector.push_back(seqName); - nameMap[seqName] = index++;//pcluster + thisLengths.push_back(currentNumFlowCells); + + thisSeqNameVector.push_back(seqName); + thisNameMap[seqName] = index++;//pcluster for(int i=0;i> intensity; if(intensity > 9.99) { intensity = 9.99; } int intI = int(100 * intensity + 0.0001); - flowDataIntI.push_back(intI); + thisFlowDataIntI.push_back(intI); } m->gobble(flowFile); } flowFile.close(); - numSeqs = seqNameVector.size(); + int numSeqs = thisSeqNameVector.size(); for(int i=0;icontrol_pressed) { break; } int iNumFlowCells = i * numFlowCells; - for(int j=lengths[i];j& mapUniqueToSeq, vector& mapSeqToUnique, vector& lengths, vector& flowDataPrI, vector& flowDataIntI){ + try{ + + ostringstream outStream; + outStream.setf(ios::fixed, ios::floatfield); + outStream.setf(ios::dec, ios::basefield); + outStream.setf(ios::showpoint); + outStream.precision(6); - int index = 0; - ifstream lookUpFile; - m->openInputFile(lookupFileName, lookUpFile); - - for(int i=0;icontrol_pressed) { break; } - float logFracFreq; - lookUpFile >> logFracFreq; - - for(int j=0;j> singleLookUp[index]; - index++; + for(int j=0;jerrorOut(e, "ShhherCommand", "getSingleLookUp"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::getJointLookUp(){ - try{ + if(i % 100 == 0){ + m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime)); + m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); + m->mothurOutEndLine(); + } + } - // the most likely joint probability (-log) that two intenities have the same polymer length - jointLookUp.resize(NUMBINS * NUMBINS, 0); + ofstream distFile(distFileName.c_str()); + distFile << outStream.str(); + distFile.close(); - for(int i=0;icontrol_pressed) { break; } - - for(int j=0;jcontrol_pressed) {} + else { + m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime)); + 
m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); + m->mothurOutEndLine(); } + + return 0; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getJointLookUp"); + m->errorOut(e, "ShhherCommand", "flowDistParentFork"); exit(1); } } - /**************************************************************************************************/ -double ShhherCommand::getProbIntensity(int intIntensity){ +float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector& mapSeqToUnique, vector& lengths, vector& flowDataPrI, vector& flowDataIntI){ try{ - - double minNegLogProb = 100000000; - + int minLength = lengths[mapSeqToUnique[seqA]]; + if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; } - for(int i=0;icontrol_pressed) { break; } - float negLogProb = singleLookUp[i * NUMBINS + intIntensity]; - if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; } + int flowAIntI = flowDataIntI[ANumFlowCells + i]; + float flowAPrI = flowDataPrI[ANumFlowCells + i]; + + int flowBIntI = flowDataIntI[BNumFlowCells + i]; + float flowBPrI = flowDataPrI[BNumFlowCells + i]; + dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; } - return minNegLogProb; + dist /= (float) minLength; + return dist; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getProbIntensity"); + m->errorOut(e, "ShhherCommand", "calcPairwiseDist"); exit(1); } } /**************************************************************************************************/ -void ShhherCommand::getUniques(){ +int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector& uniqueFlowgrams, vector& uniqueCount, vector& uniqueLengths, vector& mapSeqToUnique, vector& mapUniqueToSeq, vector& lengths, vector& flowDataPrI, vector& flowDataIntI){ try{ - - - numUniques = 0; + int numUniques = 0; uniqueFlowgrams.assign(numFlowCells * numSeqs, -1); uniqueCount.assign(numSeqs, 0); // anWeights uniqueLengths.assign(numSeqs, 0); @@ -1069,7 +2304,7 @@ 
void ShhherCommand::getUniques(){ for(int j=0;jcontrol_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); } + + return numUniques; } catch(exception& e) { m->errorOut(e, "ShhherCommand", "getUniques"); exit(1); } } - /**************************************************************************************************/ - -float ShhherCommand::calcPairwiseDist(int seqA, int seqB){ - try{ - int minLength = lengths[mapSeqToUnique[seqA]]; - if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; } - - int ANumFlowCells = seqA * numFlowCells; - int BNumFlowCells = seqB * numFlowCells; - - float dist = 0; - - for(int i=0;icontrol_pressed) { break; } - - int flowAIntI = flowDataIntI[ANumFlowCells + i]; - float flowAPrI = flowDataPrI[ANumFlowCells + i]; - - int flowBIntI = flowDataIntI[BNumFlowCells + i]; - float flowBPrI = flowDataPrI[BNumFlowCells + i]; - dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; - } - - dist /= (float) minLength; - return dist; - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "calcPairwiseDist"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){ - try{ - - ostringstream outStream; - outStream.setf(ios::fixed, ios::floatfield); - outStream.setf(ios::dec, ios::basefield); - outStream.setf(ios::showpoint); - outStream.precision(6); - - int begTime = time(NULL); - double begClock = clock(); - - for(int i=startSeq;icontrol_pressed) { break; } - - for(int j=0;jmothurOut(toString(i) + "\t" + toString(time(NULL) - begTime)); - m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); - m->mothurOutEndLine(); - } - } - - ofstream distFile(distFileName.c_str()); - distFile << outStream.str(); - distFile.close(); - - if (m->control_pressed) {} - else { - m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) 
- begTime)); - m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); - m->mothurOutEndLine(); - } - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "flowDistParentFork"); - exit(1); - } -} - -/**************************************************************************************************/ - -string ShhherCommand::createDistFile(int processors){ - try{ -//////////////////////// until I figure out the shared memory issue ////////////////////// -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) -#else - processors=1; -#endif -//////////////////////// until I figure out the shared memory issue ////////////////////// - - string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; - - unsigned long long begTime = time(NULL); - double begClock = clock(); - - if (numSeqs < processors){ processors = 1; } - - if(processors == 1) { flowDistParentFork(fDistFileName, 0, numUniques); } - - else{ //you have multiple processors - - vector start(processors, 0); - vector end(processors, 0); - - int process = 1; - vector processIDs; - - for (int i = 0; i < processors; i++) { - start[i] = int(sqrt(float(i)/float(processors)) * numUniques); - end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques); - } - -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later - process++; - }else if (pid == 0){ - flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]); - exit(0); - }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes. 
Error code: " + toString(pid)); m->mothurOutEndLine(); - perror(" : "); - for (int i=0;i pDataArray; - DWORD dwThreadIdArray[processors-1]; - HANDLE hThreadArray[processors-1]; - - //Create processor worker threads. - for(int i = 0; i < processors-1; i++){ - // Allocate memory for thread data. - string extension = extension = toString(i) + ".temp"; - - flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i); - pDataArray.push_back(tempdist); - processIDs.push_back(i); - - //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. - //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier - hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); - } - - //parent does its part - flowDistParentFork(fDistFileName, start[0], end[0]); - - //Wait until all threads have terminated. - WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); - - //Close all thread handles and free memory allocations. 
- for(int i=0; i < pDataArray.size(); i++){ - CloseHandle(hThreadArray[i]); - delete pDataArray[i]; - } - -#endif - - //append and remove temp files - for (int i=0;iappendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName); - m->mothurRemove((fDistFileName + toString(processIDs[i]) + ".temp")); - } - - } - - m->mothurOutEndLine(); - m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); - - return fDistFileName; - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "createDistFile"); - exit(1); - } - -} - -/**************************************************************************************************/ - -string ShhherCommand::createNamesFile(){ +int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector& seqNameVector, vector& mapSeqToUnique, vector& mapUniqueToSeq){ try{ vector duplicateNames(numUniques, ""); @@ -1335,31 +2366,29 @@ string ShhherCommand::createNamesFile(){ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ','; } - string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; - ofstream nameFile; - m->openOutputFile(nameFileName, nameFile); + m->openOutputFile(filename, nameFile); for(int i=0;icontrol_pressed) { break; } -// nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; + // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; } nameFile.close(); - return nameFileName; + + return 0; } catch(exception& e) { m->errorOut(e, "ShhherCommand", "createNamesFile"); exit(1); } } - //********************************************************************************************************************** -string 
ShhherCommand::cluster(string distFileName, string namesFileName){ +int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){ try { ReadMatrix* read = new ReadColumnMatrix(distFileName); @@ -1374,7 +2403,7 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){ delete read; delete clusterNameMap; - + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); @@ -1390,33 +2419,39 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){ list->setLabel(toString(cutoff)); - string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; ofstream listFile; - m->openOutputFile(listFileName, listFile); + m->openOutputFile(filename, listFile); list->print(listFile); listFile.close(); delete matrix; delete cluster; delete rabund; delete list; - - return listFileName; + + return 0; } catch(exception& e) { m->errorOut(e, "ShhherCommand", "cluster"); exit(1); } } - /**************************************************************************************************/ -void ShhherCommand::getOTUData(string listFileName){ +int ShhherCommand::getOTUData(int numSeqs, string fileName, vector& otuData, + vector& cumNumSeqs, + vector& nSeqsPerOTU, + vector >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices + vector >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI + vector& seqNumber, //tMaster->anP: the sequence id number sorted by OTU + vector& seqIndex, + map& nameMap){ try { - + ifstream listFile; - m->openInputFile(listFileName, listFile); + m->openInputFile(fileName, listFile); string label; + int numOTUs; listFile >> label >> numOTUs; - + otuData.assign(numSeqs, 0); cumNumSeqs.assign(numOTUs, 0); nSeqsPerOTU.assign(numOTUs, 0); @@ -1431,11 +2466,11 @@ void ShhherCommand::getOTUData(string listFileName){ for(int 
i=0;icontrol_pressed) { break; } - + listFile >> singleOTU; istringstream otuString(singleOTU); - + while(otuString){ string seqName = ""; @@ -1460,10 +2495,10 @@ void ShhherCommand::getOTUData(string listFileName){ } map::iterator nmIt = nameMap.find(seqName); - + int index = nmIt->second; nameMap.erase(nmIt); - + otuData[index] = i; nSeqsPerOTU[i]++; aaP[i].push_back(index); @@ -1489,6 +2524,8 @@ void ShhherCommand::getOTUData(string listFileName){ seqIndex = seqNumber; listFile.close(); + + return numOTUs; } catch(exception& e) { @@ -1496,124 +2533,28 @@ void ShhherCommand::getOTUData(string listFileName){ exit(1); } } - -/**************************************************************************************************/ - -void ShhherCommand::initPyroCluster(){ - try{ - if (numOTUs < processors) { processors = 1; } - - dist.assign(numSeqs * numOTUs, 0); - change.assign(numOTUs, 1); - centroids.assign(numOTUs, -1); - weight.assign(numOTUs, 0); - singleTau.assign(numSeqs, 1.0); - - nSeqsBreaks.assign(processors+1, 0); - nOTUsBreaks.assign(processors+1, 0); - - nSeqsBreaks[0] = 0; - for(int i=0;ierrorOut(e, "ShhherCommand", "initPyroCluster"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::fill(){ - try { - int index = 0; - for(int i=0;icontrol_pressed) { break; } - - cumNumSeqs[i] = index; - for(int j=0;jerrorOut(e, "ShhherCommand", "fill"); - exit(1); - } -} - /**************************************************************************************************/ -void ShhherCommand::calcCentroids(){ - try{ - -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - - if(processors == 1) { - calcCentroidsDriver(0, numOTUs); - } - else{ //you have multiple processors - if (numOTUs < processors){ processors = 1; } - - int process = 1; - vector processIDs; - - //loop through and create all the processes you want - while (process != processors) { - int pid = vfork(); 
- - if (pid > 0) { - processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later - process++; - }else if (pid == 0){ - calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]); - exit(0); - }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); - perror(" : "); - for (int i=0;ierrorOut(e, "ShhherCommand", "calcCentroidsDriver"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::calcCentroidsDriver(int start, int finish){ +int ShhherCommand::calcCentroidsDriver(int numOTUs, + vector& cumNumSeqs, + vector& nSeqsPerOTU, + vector& seqIndex, + vector& change, //did the centroid sequence change? 0 = no; 1 = yes + vector& centroids, //the representative flowgram for each cluster m + vector& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs) + vector& mapSeqToUnique, + vector& uniqueFlowgrams, + vector& flowDataIntI, + vector& lengths, + int numFlowCells, + vector& seqNumber){ //this function gets the most likely homopolymer length at a flow position for a group of sequences //within an otu try{ - for(int i=start;icontrol_pressed) { break; } @@ -1626,7 +2567,7 @@ void ShhherCommand::calcCentroidsDriver(int start, int finish){ for(int j=0;j 0 && count > MIN_COUNT){ vector adF(nSeqsPerOTU[i]); vector anL(nSeqsPerOTU[i]); @@ -1656,7 +2597,7 @@ void ShhherCommand::calcCentroidsDriver(int start, int finish){ double tauValue = singleTau[seqNumber[index]]; for(int k=0;kerrorOut(e, "ShhherCommand", "calcCentroidsDriver"); exit(1); } } - /**************************************************************************************************/ -double ShhherCommand::getDistToCentroid(int cent, int flow, int length){ +double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector& uniqueFlowgrams, + vector& flowDataIntI, int numFlowCells){ try{ 
int flowAValue = cent * numFlowCells; int flowBValue = flow * numFlowCells; double dist = 0; - + for(int i=0;icontrol_pressed) { break; } - - double difference = weight[i]; - weight[i] = 0; - - for(int j=0;j maxChange){ maxChange = difference; } - } - return maxChange; - } - catch(exception& e) { - m->errorOut(e, "ShhherCommand", "getNewWeights"); - exit(1); - } -} - /**************************************************************************************************/ -double ShhherCommand::getLikelihood(){ - - try{ - - vector P(numSeqs, 0); - int effNumOTUs = 0; - - for(int i=0;i MIN_WEIGHT){ - effNumOTUs++; - } - } - - string hold; - for(int i=0;icontrol_pressed) { break; } - - for(int j=0;jerrorOut(e, "ShhherCommand", "getNewWeights"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::checkCentroids(){ - try{ - vector unique(numOTUs, 1); - - for(int i=0;icontrol_pressed) { break; } - - if(unique[i] == 1){ - for(int j=i+1;jerrorOut(e, "ShhherCommand", "checkCentroids"); - exit(1); - } -} - -/**************************************************************************************************/ - -void ShhherCommand::calcNewDistances(){ - try{ - -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - - if(processors == 1) { - calcNewDistancesParent(0, numSeqs); - } - else{ //you have multiple processors - if (numSeqs < processors){ processors = 1; } - - vector > child_otuIndex(processors); - vector > child_seqIndex(processors); - vector > child_singleTau(processors); - vector totals(processors); - - int process = 1; - vector processIDs; - - //loop through and create all the processes you want - while (process != processors) { - int pid = vfork(); - - if (pid > 0) { - processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later - process++; - }else if (pid == 0){ - calcNewDistancesChild(nSeqsBreaks[process], 
nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]); - totals[process] = child_otuIndex[process].size(); - - exit(0); - }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); - perror(" : "); - for (int i=0;ierrorOut(e, "ShhherCommand", "calcNewDistances"); - exit(1); - } -} - -/**************************************************************************************************/ -#ifdef USE_MPI -void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector& otuIndex){ - - try{ - vector newTau(numOTUs,0); - vector norms(numSeqs, 0); - otuIndex.clear(); - seqIndex.clear(); - singleTau.clear(); - - for(int i=startSeq;icontrol_pressed) { break; } - - double offset = 1e8; - int indexOffset = i * numOTUs; - - for(int j=0;j MIN_WEIGHT && change[j] == 1){ - dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); - } - if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ - offset = dist[indexOffset + j]; - } - } - - for(int j=0;j MIN_WEIGHT){ - newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; - norms[i] += newTau[j]; - } - else{ - newTau[j] = 0.0; - } - } - - for(int j=0;j MIN_TAU){ - otuIndex.push_back(j); - seqIndex.push_back(i); - singleTau.push_back(newTau[j]); - } +double ShhherCommand::getNewWeights(int numOTUs, vector& cumNumSeqs, vector& nSeqsPerOTU, vector& singleTau, vector& seqNumber, vector& weight){ + try{ + + double maxChange = 0; + + for(int i=0;icontrol_pressed) { break; } + + double difference = weight[i]; + weight[i] = 0; + + for(int j=0;j maxChange){ maxChange = difference; } } + return maxChange; } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI"); + m->errorOut(e, "ShhherCommand", "getNewWeights"); exit(1); } } -#endif + /**************************************************************************************************/ -void 
ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector& child_otuIndex, vector& child_seqIndex, vector& child_singleTau){ +double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector& nSeqsPerOTU, vector& seqNumber, vector& cumNumSeqs, vector& seqIndex, vector& dist, vector& weight){ try{ - vector newTau(numOTUs,0); - vector norms(numSeqs, 0); - child_otuIndex.resize(0); - child_seqIndex.resize(0); - child_singleTau.resize(0); - for(int i=startSeq;i P(numSeqs, 0); + int effNumOTUs = 0; + + for(int i=0;i MIN_WEIGHT){ + effNumOTUs++; + } + } + + string hold; + for(int i=0;icontrol_pressed) { break; } - double offset = 1e8; - int indexOffset = i * numOTUs; - - - for(int j=0;j MIN_WEIGHT && change[j] == 1){ - dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); - } + for(int j=0;j MIN_WEIGHT && dist[indexOffset + j] < offset){ - offset = dist[indexOffset + j]; - } + P[nI] += weight[i] * exp(-singleDist * sigma); } - - for(int j=0;j MIN_WEIGHT){ - newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; - norms[i] += newTau[j]; - } - else{ - newTau[j] = 0.0; - } + } + double nLL = 0.00; + for(int i=0;ierrorOut(e, "ShhherCommand", "getNewWeights"); + exit(1); + } +} + +/**************************************************************************************************/ + +int ShhherCommand::checkCentroids(int numOTUs, vector& centroids, vector& weight){ + try{ + vector unique(numOTUs, 1); + + for(int i=0;i MIN_TAU){ - child_otuIndex.push_back(j); - child_seqIndex.push_back(i); - child_singleTau.push_back(newTau[j]); + if (m->control_pressed) { break; } + + if(unique[i] == 1){ + for(int j=i+1;jerrorOut(e, "ShhherCommand", "calcNewDistancesChild"); + m->errorOut(e, "ShhherCommand", "checkCentroids"); exit(1); } } - /**************************************************************************************************/ -void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ +void 
ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector& nSeqsPerOTU, vector& dist, + vector& weight, vector& change, vector& centroids, + vector >& aaP, vector& singleTau, vector >& aaI, + vector& seqNumber, vector& seqIndex, + vector& uniqueFlowgrams, + vector& flowDataIntI, int numFlowCells, vector& lengths){ try{ @@ -2030,26 +2781,26 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ vector newTau(numOTUs,0); vector norms(numSeqs, 0); nSeqsPerOTU.assign(numOTUs, 0); - - for(int i=startSeq;icontrol_pressed) { break; } int indexOffset = i * numOTUs; - + double offset = 1e8; for(int j=0;j MIN_WEIGHT && change[j] == 1){ - dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]); + dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells); } - + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ offset = dist[indexOffset + j]; } } - + for(int j=0;j MIN_WEIGHT){ newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; @@ -2059,11 +2810,11 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ newTau[j] = 0.0; } } - + for(int j=0;j MIN_TAU){ @@ -2082,19 +2833,44 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){ nSeqsPerOTU[j]++; } } - + } - + } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "calcNewDistancesParent"); + m->errorOut(e, "ShhherCommand", "calcNewDistances"); exit(1); } } +/**************************************************************************************************/ +int ShhherCommand::fill(int numOTUs, vector& seqNumber, vector& seqIndex, vector& cumNumSeqs, vector& nSeqsPerOTU, vector >& aaP, vector >& aaI){ + try { + int index = 0; + for(int i=0;icontrol_pressed) { return 0; } + + cumNumSeqs[i] = index; + for(int j=0;jerrorOut(e, "ShhherCommand", "fill"); + exit(1); + } +} 
/**************************************************************************************************/ -void ShhherCommand::setOTUs(){ +void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector& seqNumber, vector& seqIndex, vector& cumNumSeqs, vector& nSeqsPerOTU, + vector& otuData, vector& singleTau, vector& dist, vector >& aaP, vector >& aaI){ try { vector bigTauMatrix(numOTUs * numSeqs, 0.0000); @@ -2137,26 +2913,28 @@ void ShhherCommand::setOTUs(){ nSeqsPerOTU[index]++; } - fill(); + + fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); } catch(exception& e) { - m->errorOut(e, "ShhherCommand", "calcNewDistances"); + m->errorOut(e, "ShhherCommand", "setOTUs"); exit(1); } } - /**************************************************************************************************/ -void ShhherCommand::writeQualities(vector otuCounts){ +void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector otuCounts, vector& nSeqsPerOTU, vector& seqNumber, + vector& singleTau, vector& flowDataIntI, vector& uniqueFlowgrams, vector& cumNumSeqs, + vector& mapUniqueToSeq, vector& seqNameVector, vector& centroids, vector >& aaI){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.qual"; - + if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual"; + ofstream qualityFile; m->openOutputFile(qualityFileName, qualityFile); - + qualityFile.setf(ios::fixed, ios::floatfield); qualityFile.setf(ios::showpoint); qualityFile << setprecision(6); @@ -2245,7 +3023,7 @@ void ShhherCommand::writeQualities(vector otuCounts){ } qualityFile.close(); outputNames.push_back(qualityFileName); - + } catch(exception& e) { m->errorOut(e, "ShhherCommand", "writeQualities"); @@ -2255,11 +3033,11 @@ void 
ShhherCommand::writeQualities(vector otuCounts){ /**************************************************************************************************/ -void ShhherCommand::writeSequences(vector otuCounts){ +void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector otuCounts, vector& uniqueFlowgrams, vector& seqNameVector, vector >& aaI, vector& centroids){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.fasta"; + if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta"; ofstream fastaFile; m->openOutputFile(fastaFileName, fastaFile); @@ -2288,11 +3066,11 @@ void ShhherCommand::writeSequences(vector otuCounts){ } } fastaFile.close(); - + outputNames.push_back(fastaFileName); - - if(compositeFASTAFileName != ""){ - m->appendFiles(fastaFileName, compositeFASTAFileName); + + if(thisCompositeFASTAFileName != ""){ + m->appendFiles(fastaFileName, thisCompositeFASTAFileName); } } catch(exception& e) { @@ -2303,11 +3081,11 @@ void ShhherCommand::writeSequences(vector otuCounts){ /**************************************************************************************************/ -void ShhherCommand::writeNames(vector otuCounts){ +void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector otuCounts, vector& seqNameVector, vector >& aaI, vector& nSeqsPerOTU){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.names"; + if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + string nameFileName = thisOutputDir + 
m->getRootName(m->getSimpleName(filename)) + "shhh.names"; ofstream nameFile; m->openOutputFile(nameFileName, nameFile); @@ -2329,8 +3107,8 @@ void ShhherCommand::writeNames(vector otuCounts){ outputNames.push_back(nameFileName); - if(compositeNamesFileName != ""){ - m->appendFiles(nameFileName, compositeNamesFileName); + if(thisCompositeNamesFileName != ""){ + m->appendFiles(nameFileName, thisCompositeNamesFileName); } } catch(exception& e) { @@ -2341,12 +3119,12 @@ void ShhherCommand::writeNames(vector otuCounts){ /**************************************************************************************************/ -void ShhherCommand::writeGroups(){ +void ShhherCommand::writeGroups(string filename, int numSeqs, vector& seqNameVector){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); } - string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)); - string groupFileName = fileRoot + ".shhh.groups"; + if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename)); + string groupFileName = fileRoot + "shhh.groups"; ofstream groupFile; m->openOutputFile(groupFileName, groupFile); @@ -2356,7 +3134,7 @@ void ShhherCommand::writeGroups(){ } groupFile.close(); outputNames.push_back(groupFileName); - + } catch(exception& e) { m->errorOut(e, "ShhherCommand", "writeGroups"); @@ -2366,11 +3144,11 @@ void ShhherCommand::writeGroups(){ /**************************************************************************************************/ -void ShhherCommand::writeClusters(vector otuCounts){ +void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector otuCounts, vector& centroids, vector& uniqueFlowgrams, vector& seqNameVector, vector >& aaI, vector& nSeqsPerOTU, vector& lengths, vector& flowDataIntI){ try { string thisOutputDir = outputDir; - if (outputDir == "") { thisOutputDir += 
m->hasPath(flowFileName); } - string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.counts"; + if (outputDir == "") { thisOutputDir += m->hasPath(filename); } + string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts"; ofstream otuCountsFile; m->openOutputFile(otuCountsFileName, otuCountsFile); @@ -2403,7 +3181,7 @@ void ShhherCommand::writeClusters(vector otuCounts){ for(int k=0;k otuCounts){ } otuCountsFile.close(); outputNames.push_back(otuCountsFileName); - + } catch(exception& e) { m->errorOut(e, "ShhherCommand", "writeClusters"); @@ -2424,4 +3202,92 @@ void ShhherCommand::writeClusters(vector otuCounts){ } } -//********************************************************************************************************************** +/**************************************************************************************************/ + +void ShhherCommand::getSingleLookUp(){ + try{ + // these are the -log probabilities that a signal corresponds to a particular homopolymer length + singleLookUp.assign(HOMOPS * NUMBINS, 0); + + int index = 0; + ifstream lookUpFile; + m->openInputFile(lookupFileName, lookUpFile); + + for(int i=0;icontrol_pressed) { break; } + + float logFracFreq; + lookUpFile >> logFracFreq; + + for(int j=0;j> singleLookUp[index]; + index++; + } + } + lookUpFile.close(); + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "getSingleLookUp"); + exit(1); + } +} + +/**************************************************************************************************/ + +void ShhherCommand::getJointLookUp(){ + try{ + + // the most likely joint probability (-log) that two intenities have the same polymer length + jointLookUp.resize(NUMBINS * NUMBINS, 0); + + for(int i=0;icontrol_pressed) { break; } + + for(int j=0;jerrorOut(e, "ShhherCommand", "getJointLookUp"); + exit(1); + } +} + 
+/**************************************************************************************************/ + +double ShhherCommand::getProbIntensity(int intIntensity){ + try{ + + double minNegLogProb = 100000000; + + + for(int i=0;icontrol_pressed) { break; } + + float negLogProb = singleLookUp[i * NUMBINS + intIntensity]; + if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; } + } + + return minNegLogProb; + } + catch(exception& e) { + m->errorOut(e, "ShhherCommand", "getProbIntensity"); + exit(1); + } +} + + + + diff --git a/shhhercommand.h b/shhhercommand.h index b3a0071..a7af7da 100644 --- a/shhhercommand.h +++ b/shhhercommand.h @@ -12,6 +12,14 @@ #include "mothur.h" #include "command.hpp" +#include "readcolumn.h" +#include "readmatrix.hpp" +#include "rabundvector.hpp" +#include "sabundvector.hpp" +#include "listvector.hpp" +#include "cluster.hpp" +#include "sparsematrix.hpp" +#include //********************************************************************************************************************** @@ -42,20 +50,92 @@ public: void help() { m->mothurOut(getHelpString()); } private: + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + int abort; - string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName; int processors, maxIters; float cutoff, sigma, minDelta; string flowOrder; - - vector nSeqsBreaks; - vector nOTUsBreaks; + + vector outputNames; vector singleLookUp; vector jointLookUp; + vector flowFileVector; + + int driver(vector, string, string, int, int); + int createProcesses(vector); + int getFlowData(string, vector&, vector&, vector&, map&, int&); + int getUniques(int, int, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&); + int flowDistParentFork(int, string, int, vector&, vector&, vector&, vector&, vector&); + float calcPairwiseDist(int, int, int, vector&, vector&, vector&, vector&); + int createNamesFile(int, int, 
string, vector&, vector&, vector&); + int cluster(string, string, string); + int getOTUData(int numSeqs, string, vector&, vector&, vector&, vector >&, vector >&, vector&, vector&,map&); + int calcCentroidsDriver(int numOTUs, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, int, vector&); + double getDistToCentroid(int, int, int, vector&, vector&, int); + double getNewWeights(int, vector&, vector&, vector&, vector&, vector&); + + double getLikelihood(int, int, vector&, vector&, vector&, vector&, vector&, vector&); + int checkCentroids(int, vector&, vector&); + void calcNewDistances(int, int, vector& , vector&,vector& , vector& change, vector&,vector >&, vector&, vector >&, vector&, vector&, vector&, vector&, int, vector&); + int fill(int, vector&, vector&, vector&, vector&, vector >&, vector >&); + void setOTUs(int, int, vector&, vector&, vector&, vector&, + vector&, vector&, vector&, vector >&, vector >&); + void writeQualities(int, int, string, vector, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector >&); + void writeSequences(string, int, int, string, vector, vector&, vector&, vector >&, vector&); + void writeNames(string, int, string, vector, vector&, vector >&, vector&); + void writeGroups(string, int, vector&); + void writeClusters(string, int, int, vector, vector&, vector&, vector&, vector >&, vector&, vector&, vector&); + + void getSingleLookUp(); + void getJointLookUp(); + double getProbIntensity(int); - vector seqNameVector; + +#ifdef USE_MPI + string flowDistMPI(int, int); + void calcNewDistancesChildMPI(int, int, vector&); + + int pid, ncpus; + + void getFlowData(); + void getUniques(); + + float calcPairwiseDist(int, int); + void flowDistParentFork(string, int, int); + + string createDistFile(int); + string createNamesFile(); + string cluster(string, string); + + void getOTUData(string); + void initPyroCluster(); + void fill(); + void calcCentroids(); + void 
calcCentroidsDriver(int, int); + double getDistToCentroid(int, int, int); + double getNewWeights(); + double getLikelihood(); + void checkCentroids(); + void calcNewDistances(); + void calcNewDistancesParent(int, int); + void calcNewDistancesChild(int, int, vector&, vector&, vector&); + + + void setOTUs(); + void writeQualities(vector); + void writeSequences(vector); + void writeNames(vector); + void writeGroups(); + void writeClusters(vector); + + vector seqNameVector; vector lengths; vector flowDataIntI; vector flowDataPrI; @@ -77,50 +157,10 @@ private: vector mapSeqToUnique; vector mapUniqueToSeq; vector uniqueLengths; + int numSeqs, numUniques, numOTUs, numFlowCells; + vector nSeqsBreaks; + vector nOTUsBreaks; - vector outputNames; - - int numSeqs, numUniques, numOTUs, numFlowCells; - - void getSingleLookUp(); - void getJointLookUp(); - void getFlowData(); - void getUniques(); - double getProbIntensity(int); - float calcPairwiseDist(int, int); - void flowDistParentFork(string, int, int); - - string createDistFile(int); - string createNamesFile(); - string cluster(string, string); - - void getOTUData(string); - void initPyroCluster(); - void fill(); - void calcCentroids(); - void calcCentroidsDriver(int, int); - double getDistToCentroid(int, int, int); - double getNewWeights(); - double getLikelihood(); - void checkCentroids(); - void calcNewDistances(); - void calcNewDistancesParent(int, int); - void calcNewDistancesChild(int, int, vector&, vector&, vector&); - - - void setOTUs(); - void writeQualities(vector); - void writeSequences(vector); - void writeNames(vector); - void writeGroups(); - void writeClusters(vector); - - -#ifdef USE_MPI - string flowDistMPI(int, int); - void calcNewDistancesChildMPI(int, int, vector&); - - int pid, ncpus; #endif }; @@ -129,32 +169,34 @@ private: //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). 
-struct flowDistParentForkData { - string distFileName; - vector mapUniqueToSeq; - vector mapSeqToUnique; - vector lengths; - vector flowDataIntI; - vector flowDataPrI; +struct shhhFlowsData { + int threadID, maxIters; + float cutoff, sigma, minDelta; + string flowOrder; + vector singleLookUp; vector jointLookUp; + vector filenames; + vector outputNames; + string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir; + int start, stop; MothurOut* m; - int threadID, startSeq, stopSeq, numFlowCells; - float cutoff; - flowDistParentForkData(){} - flowDistParentForkData(string d, vector mapU, vector mapS, vector l, vector flowD, vector flowDa, vector j, MothurOut* mout, int st, int sp, int n, float cut, int tid) { - distFileName = d; - mapUniqueToSeq = mapU; - mapSeqToUnique = mapS; - lengths = l; - flowDataIntI = flowD; - flowDataPrI = flowDa; - jointLookUp = j; + shhhFlowsData(){} + shhhFlowsData(vector f, string cf, string cn, string ou, string flor, vector jl, vector sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) { + filenames = f; + thisCompositeFASTAFileName = cf; + thisCompositeNameFileName = cn; + outputDir = ou; + flowOrder = flor; m = mout; - startSeq = st; - stopSeq = sp; - numFlowCells = n; + start = st; + stop = sp; cutoff= cut; + sigma = si; + minDelta = mD; + maxIters = mx; + jointLookUp = jl; + singleLookUp = sl; threadID = tid; } }; @@ -162,83 +204,1158 @@ struct flowDistParentForkData { /**************************************************************************************************/ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) #else -static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){ - flowDistParentForkData* pDataArray; - pDataArray = (flowDistParentForkData*)lpParam; +static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ + shhhFlowsData* pDataArray; + pDataArray = (shhhFlowsData*)lpParam; try { - ostringstream outStream; - outStream.setf(ios::fixed, 
ios::floatfield); - outStream.setf(ios::dec, ios::basefield); - outStream.setf(ios::showpoint); - outStream.precision(6); - - int begTime = time(NULL); - double begClock = clock(); - string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-"; - cout << tempOut << endl; + + for(int l=pDataArray->start;lstop;l++){ + + if (pDataArray->m->control_pressed) { break; } + + string flowFileName = pDataArray->filenames[l]; + + pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n"); + pDataArray->m->mothurOut("Reading flowgrams...\n"); + + vector seqNameVector; + vector lengths; + vector flowDataIntI; + vector flowDataPrI; + map nameMap; + vector uniqueFlowgrams; + vector uniqueCount; + vector mapSeqToUnique; + vector mapUniqueToSeq; + vector uniqueLengths; + int numFlowCells; + + //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells); + /*****************************************************************************************************/ + + ifstream flowFile; + // cout << "herethread " << flowFileName << '\t' << &flowFile << endl; + pDataArray->m->openInputFile(flowFileName, flowFile); + + // cout << "herethread " << flowFileName << endl; + string seqName; + int currentNumFlowCells; + float intensity; + + flowFile >> numFlowCells; + int index = 0;//pcluster + while(!flowFile.eof()){ + + if (pDataArray->m->control_pressed) { flowFile.close(); return 0; } + + flowFile >> seqName >> currentNumFlowCells; + lengths.push_back(currentNumFlowCells); + // cout << "herethread " << seqName << endl; + seqNameVector.push_back(seqName); + nameMap[seqName] = index++;//pcluster + + for(int i=0;i> intensity; + if(intensity > 9.99) { intensity = 9.99; } + int intI = int(100 * intensity + 0.0001); + flowDataIntI.push_back(intI); + } + pDataArray->m->gobble(flowFile); + } + flowFile.close(); 
+ + int numSeqs = seqNameVector.size(); + // cout << numSeqs << endl; + for(int i=0;im->control_pressed) { return 0; } + + int iNumFlowCells = i * numFlowCells; + for(int j=lengths[i];jstartSeq;istopSeq;i++){ + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("Identifying unique flowgrams...\n"); + //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI); + /*****************************************************************************************************/ + int numUniques = 0; + uniqueFlowgrams.assign(numFlowCells * numSeqs, -1); + uniqueCount.assign(numSeqs, 0); // anWeights + uniqueLengths.assign(numSeqs, 0); + mapSeqToUnique.assign(numSeqs, -1); + mapUniqueToSeq.assign(numSeqs, -1); + + vector uniqueFlowDataIntI(numFlowCells * numSeqs, -1); + + for(int i=0;im->control_pressed) { return 0; } + + int index = 0; + + vector current(numFlowCells); + for(int j=0;j uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; } + break; + } + index++; + } + + if(index == numUniques){ + uniqueLengths[numUniques] = lengths[i]; + uniqueCount[numUniques] = 1; + mapSeqToUnique[i] = numUniques;//anMap + mapUniqueToSeq[numUniques] = i;//anF + + for(int k=0;km->control_pressed) { return 0; } + //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); + + flowDataPrI[i] = 100000000; + + for(int j=0;jm->control_pressed) { return 0; } + + float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]]; + if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; } + } + } + + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("Calculating distances between flowgrams...\n"); + string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; + unsigned long long begTime = 
time(NULL); + double begClock = clock(); + + //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); + /*****************************************************************************************************/ + ostringstream outStream; + outStream.setf(ios::fixed, ios::floatfield); + outStream.setf(ios::dec, ios::basefield); + outStream.setf(ios::showpoint); + outStream.precision(6); + + int thisbegTime = time(NULL); + double thisbegClock = clock(); + + for(int i=0;im->control_pressed) { break; } + + for(int j=0;jm->control_pressed) { break; } + + int flowAIntI = flowDataIntI[ANumFlowCells + k]; + float flowAPrI = flowDataPrI[ANumFlowCells + k]; + + int flowBIntI = flowDataIntI[BNumFlowCells + k]; + float flowBPrI = flowDataPrI[BNumFlowCells + k]; + flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; + } + + flowDistance /= (float) minLength; + /*****************************************************************************************************/ + + if(flowDistance < 1e-6){ + outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl; + } + else if(flowDistance <= pDataArray->cutoff){ + outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl; + } + } + if(i % 100 == 0){ + pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime)); + pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + } + + ofstream distFile(distFileName.c_str()); + distFile << outStream.str(); + distFile.close(); + + if (pDataArray->m->control_pressed) {} + else { + pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime)); + pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + 
/*****************************************************************************************************/ + + pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); + + string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; + //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq); + /*****************************************************************************************************/ + vector duplicateNames(numUniques, ""); + for(int i=0;im->openOutputFile(namesFileName, nameFile); + + for(int i=0;im->control_pressed) { nameFile.close(); return 0; } + nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; + } + nameFile.close(); + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("\nClustering flowgrams...\n"); + string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; + //cluster(listFileName, distFileName, namesFileName); + /*****************************************************************************************************/ + ReadMatrix* read = new ReadColumnMatrix(distFileName); + read->setCutoff(pDataArray->cutoff); + + NameAssignment* clusterNameMap = new NameAssignment(namesFileName); + clusterNameMap->readMap(); + read->read(clusterNameMap); + + ListVector* list = read->getListVector(); + SparseMatrix* matrix = read->getMatrix(); + + delete read; + delete clusterNameMap; + + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); + + Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest"); + string tag = cluster->getTag(); + + double clusterCutoff = pDataArray->cutoff; + while 
(matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){ + + if (pDataArray->m->control_pressed) { break; } + + cluster->update(clusterCutoff); + } + + list->setLabel(toString(pDataArray->cutoff)); + + ofstream listFileOut; + pDataArray->m->openOutputFile(listFileName, listFileOut); + list->print(listFileOut); + listFileOut.close(); + + delete matrix; delete cluster; delete rabund; delete list; + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { return 0; } + + vector otuData; + vector cumNumSeqs; + vector nSeqsPerOTU; + vector > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices + vector > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI + vector seqNumber; //tMaster->anP: the sequence id number sorted by OTU + vector seqIndex; //tMaster->anI; the index that corresponds to seqNumber + + + //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap); + /*****************************************************************************************************/ + ifstream listFile; + pDataArray->m->openInputFile(listFileName, listFile); + string label; + int numOTUs; + + listFile >> label >> numOTUs; + + otuData.assign(numSeqs, 0); + cumNumSeqs.assign(numOTUs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + aaP.clear();aaP.resize(numOTUs); + + seqNumber.clear(); + aaI.clear(); + seqIndex.clear(); + + string singleOTU = ""; + + for(int i=0;im->control_pressed) { break; } + + listFile >> singleOTU; + + istringstream otuString(singleOTU); + + while(otuString){ + + string seqName = ""; + + for(int j=0;j::iterator nmIt = nameMap.find(seqName); + int index = nmIt->second; + + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + seqName = ""; + } + } + + map::iterator nmIt = nameMap.find(seqName); + + 
int index = nmIt->second; + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + + otuString.get(); + } + + sort(aaP[i].begin(), aaP[i].end()); + for(int j=0;jm->control_pressed) { return 0; } + + pDataArray->m->mothurRemove(distFileName); + pDataArray->m->mothurRemove(namesFileName); + pDataArray->m->mothurRemove(listFileName); + + vector dist; //adDist - distance of sequences to centroids + vector change; //did the centroid sequence change? 0 = no; 1 = yes + vector centroids; //the representative flowgram for each cluster m + vector weight; + vector singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs) + vector nSeqsBreaks; + vector nOTUsBreaks; + + dist.assign(numSeqs * numOTUs, 0); + change.assign(numOTUs, 1); + centroids.assign(numOTUs, -1); + weight.assign(numOTUs, 0); + singleTau.assign(numSeqs, 1.0); + + nSeqsBreaks.assign(2, 0); + nOTUsBreaks.assign(2, 0); + + nSeqsBreaks[0] = 0; + nSeqsBreaks[1] = numSeqs; + nOTUsBreaks[1] = numOTUs; if (pDataArray->m->control_pressed) { break; } - cout << "thread i = " << i << endl; - for(int j=0;jm->mothurOut("\nDenoising flowgrams...\n"); + pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); + + while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){ + + if (pDataArray->m->control_pressed) { break; } - cout << "thread j = " << j << endl; - float flowDistance = 0.0; - ////////////////// calcPairwiseDist /////////////////// - //needed because this is a static global function that can't see the classes internal functions + double cycClock = clock(); + unsigned long long cycTime = time(NULL); + //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); + /*****************************************************************************************************/ + int indexFill = 0; + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; 
+ for(int j=0;jlengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[i]]]; - if(pDataArray->lengths[pDataArray->mapUniqueToSeq[j]] < minLength){ minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[j]]]; } + if (pDataArray->m->control_pressed) { break; } + + //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); + /*****************************************************************************************************/ + for(int i=0;im->control_pressed) { break; } + + double count = 0; + int position = 0; + int minFlowGram = 100000000; + double minFlowValue = 1e8; + change[i] = 0; //FALSE + + for(int j=0;j 0 && count > MIN_COUNT){ + vector adF(nSeqsPerOTU[i]); + vector anL(nSeqsPerOTU[i]); + + for(int j=0;jsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + dist = dist / (double)lengths[nI]; + /*****************************************************************************************************/ + adF[k] += dist * tauValue; + } + } + + for(int j=0;jm->control_pressed) { break; } + + //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight); + /*****************************************************************************************************/ + double maxChange = 0; + + for(int i=0;im->control_pressed) { break; } + + double difference = weight[i]; + weight[i] = 0; + + for(int j=0;j maxChange){ maxChange = difference; } + } + maxDelta = maxChange; + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { break; } + + //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); + 
/*****************************************************************************************************/ + vector P(numSeqs, 0); + int effNumOTUs = 0; + + for(int i=0;i MIN_WEIGHT){ + effNumOTUs++; + } + } + + string hold; + for(int i=0;im->control_pressed) { break; } + + for(int j=0;jsigma); + } + } + double nLL = 0.00; + for(int i=0;isigma); + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { break; } + + //checkCentroids(numOTUs, centroids, weight); + /*****************************************************************************************************/ + vector unique(numOTUs, 1); + + for(int i=0;im->control_pressed) { break; } + + if(unique[i] == 1){ + for(int j=i+1;jm->control_pressed) { break; } - int ANumFlowCells = pDataArray->mapUniqueToSeq[i] * pDataArray->numFlowCells; - int BNumFlowCells = pDataArray->mapUniqueToSeq[j] * pDataArray->numFlowCells; + //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths); + /*****************************************************************************************************/ + int total = 0; + vector newTau(numOTUs,0); + vector norms(numSeqs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=0;im->control_pressed) { break; } + + int indexOffset = i * numOTUs; + + double offset = 1e8; + + for(int j=0;j MIN_WEIGHT && change[j] == 1){ + //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells); + /*****************************************************************************************************/ + int flowAValue = centroids[j] * numFlowCells; + int flowBValue = i * numFlowCells; + + double distTemp = 0; + + for(int l=0;lsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + 
dist[indexOffset + j] = distTemp / (double)lengths[i]; + /*****************************************************************************************************/ + + } + + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ + offset = dist[indexOffset + j]; + } + } + + for(int j=0;j MIN_WEIGHT){ + newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j]; + norms[i] += newTau[j]; + } + else{ + newTau[j] = 0.0; + } + } + + for(int j=0;j MIN_TAU){ + + int oldTotal = total; + + total++; + + singleTau.resize(total, 0); + seqNumber.resize(total, 0); + seqIndex.resize(total, 0); + + singleTau[oldTotal] = newTau[j]; + + aaP[j][nSeqsPerOTU[j]] = oldTotal; + aaI[j][nSeqsPerOTU[j]] = i; + nSeqsPerOTU[j]++; + } + } + + } + + /*****************************************************************************************************/ + + if (pDataArray->m->control_pressed) { break; } - for(int k=0;km->control_pressed) { break; } - - int flowAIntI = pDataArray->flowDataIntI[ANumFlowCells + k]; - float flowAPrI = pDataArray->flowDataPrI[ANumFlowCells + k]; - - int flowBIntI = pDataArray->flowDataIntI[BNumFlowCells + k]; - float flowBPrI = pDataArray->flowDataPrI[BNumFlowCells + k]; - flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; - } + iter++; - flowDistance /= (float) minLength; - //cout << flowDistance << endl; - ////////////////// end of calcPairwiseDist /////////////////// - - if(flowDistance < 1e-6){ - outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << 0.000000 << endl; - } - else if(flowDistance <= pDataArray->cutoff){ - outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << flowDistance << endl; - } - } - if(i % 100 == 0){ - pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime)); - pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); - 
pDataArray->m->mothurOutEndLine(); - } + pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); + + } + + if (pDataArray->m->control_pressed) { break; } + + pDataArray->m->mothurOut("\nFinalizing...\n"); + //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); + /*****************************************************************************************************/ + int indexFill = 0; + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; + for(int j=0;jm->control_pressed) { break; } + + //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI); + /*****************************************************************************************************/ + vector bigTauMatrix(numOTUs * numSeqs, 0.0000); + + for(int i=0;im->control_pressed) { break; } + + for(int j=0;j maxTau){ + maxTau = bigTauMatrix[i * numOTUs + j]; + maxOTU = j; + } + } + + otuData[i] = maxOTU; + } + + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; + for(int j=0;jm->control_pressed) { break; } + + vector otuCounts(numOTUs, 0); + for(int i=0;im->control_pressed) { break; } + + double count = 0; + int position = 0; + int minFlowGram = 100000000; + double minFlowValue = 1e8; + change[i] = 0; //FALSE + + for(int j=0;j 0 && count > MIN_COUNT){ + vector adF(nSeqsPerOTU[i]); + vector anL(nSeqsPerOTU[i]); + + for(int j=0;jsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + dist = dist / (double)lengths[nI]; + /*****************************************************************************************************/ + adF[k] += dist * tauValue; + } + } + + for(int j=0;jm->control_pressed) { break; } + + //writeQualities(numOTUs, numFlowCells, 
flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); + if (pDataArray->m->control_pressed) { break; } + /*****************************************************************************************************/ + string thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual"; + + ofstream qualityFile; + pDataArray->m->openOutputFile(qualityFileName, qualityFile); + + qualityFile.setf(ios::fixed, ios::floatfield); + qualityFile.setf(ios::showpoint); + qualityFile << setprecision(6); + + vector > qualities(numOTUs); + vector pr(HOMOPS, 0); + + + for(int i=0;im->control_pressed) { break; } + + int index = 0; + int base = 0; + + if(nSeqsPerOTU[i] > 0){ + qualities[i].assign(1024, -1); + + while(index < numFlowCells){ + double maxPrValue = 1e8; + short maxPrIndex = -1; + double count = 0.0000; + + pr.assign(HOMOPS, 0); + + for(int j=0;jsingleLookUp[s * NUMBINS + intensity]; + } + } + + maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index]; + maxPrValue = pr[maxPrIndex]; + + if(count > MIN_COUNT){ + double U = 0.0000; + double norm = 0.0000; + + for(int s=0;s0.00){ + temp = log10(U); + } + else{ + temp = -10.1; + } + temp = floor(-10 * temp); + value = (int)floor(temp); + if(value > 100){ value = 100; } + + qualities[i][base] = (int)value; + base++; + } + } + + index++; + } + } + + + if(otuCounts[i] > 0){ + qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; + + int j=4; //need to get past the first four bases + while(qualities[i][j] != -1){ + qualityFile << qualities[i][j] << ' '; + j++; + } + qualityFile << endl; + } + } + qualityFile.close(); + pDataArray->outputNames.push_back(qualityFileName); + 
/*****************************************************************************************************/ + + // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids); + if (pDataArray->m->control_pressed) { break; } + /*****************************************************************************************************/ + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta"; + ofstream fastaFile; + pDataArray->m->openOutputFile(fastaFileName, fastaFile); + + vector names(numOTUs, ""); + + for(int i=0;im->control_pressed) { break; } + + int index = centroids[i]; + + if(otuCounts[i] > 0){ + fastaFile << '>' << seqNameVector[aaI[i][0]] << endl; + + string newSeq = ""; + + for(int j=0;jflowOrder[j % 4]; + for(int k=0;koutputNames.push_back(fastaFileName); + + if(pDataArray->thisCompositeFASTAFileName != ""){ + pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName); + } + + /*****************************************************************************************************/ + + //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); + if (pDataArray->m->control_pressed) { break; } + /*****************************************************************************************************/ + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names"; + ofstream nameFileOut; + pDataArray->m->openOutputFile(nameFileName, nameFileOut); + + for(int i=0;im->control_pressed) { break; } + + if(otuCounts[i] > 0){ + 
nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]]; + + for(int j=1;joutputNames.push_back(nameFileName); + + + if(pDataArray->thisCompositeNameFileName != ""){ + pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName); + } + /*****************************************************************************************************/ + + //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); + if (pDataArray->m->control_pressed) { break; } + /*****************************************************************************************************/ + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts"; + ofstream otuCountsFile; + pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile); + + string bases = pDataArray->flowOrder; + + for(int i=0;im->control_pressed) { + break; + } + //output the translated version of the centroid sequence for the otu + if(otuCounts[i] > 0){ + int index = centroids[i]; + + otuCountsFile << "ideal\t"; + for(int j=8;joutputNames.push_back(otuCountsFileName); + /*****************************************************************************************************/ + + //writeGroups(flowFileName, numSeqs, seqNameVector); + if (pDataArray->m->control_pressed) { break; } + /*****************************************************************************************************/ + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)); + string groupFileName = fileRoot + "shhh.groups"; + ofstream groupFile; + 
pDataArray->m->openOutputFile(groupFileName, groupFile); + + for(int i=0;im->control_pressed) { break; } + groupFile << seqNameVector[i] << '\t' << fileRoot << endl; + } + groupFile.close(); + pDataArray->outputNames.push_back(groupFileName); + /*****************************************************************************************************/ + + pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); } - ofstream distFile(pDataArray->distFileName.c_str()); - distFile << outStream.str(); - distFile.close(); - - if (pDataArray->m->control_pressed) {} - else { - pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime)); - pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)); - pDataArray->m->mothurOutEndLine(); - } + if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; } + + return 0; } catch(exception& e) { - pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction"); + pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction"); exit(1); } } diff --git a/suffixdb.hpp b/suffixdb.hpp index ca1ff42..3796801 100644 --- a/suffixdb.hpp +++ b/suffixdb.hpp @@ -21,19 +21,12 @@ #include "mothur.h" #include "database.hpp" #include "suffixtree.hpp" -//class SuffixTree; class SuffixDB : public Database { public: SuffixDB(int); SuffixDB(); - SuffixDB(const SuffixDB& sdb) : count(sdb.count), Database(sdb) { - for (int i = 0; i < sdb.suffixForest.size(); i++) { - SuffixTree temp(sdb.suffixForest[i]); - suffixForest.push_back(temp); - } - } ~SuffixDB(); void generateDB() {}; //adding sequences generates the db diff --git a/suffixnodes.hpp b/suffixnodes.hpp index 1e078a6..6a22c4d 100644 --- a/suffixnodes.hpp +++ b/suffixnodes.hpp @@ -25,7 +25,6 @@ 
class SuffixNode { public: SuffixNode(int, int, int); - SuffixNode(const SuffixNode& sn) : parentNode(sn.parentNode), startCharPosition(sn.startCharPosition), endCharPosition(sn.endCharPosition) {m = MothurOut::getInstance();} virtual ~SuffixNode() {} virtual void print(string, int) = 0; virtual void setChildren(char, int); @@ -63,7 +62,6 @@ class SuffixBranch : public SuffixNode { public: SuffixBranch(int, int, int); - SuffixBranch(const SuffixBranch& sb) : suffixNode(sb.suffixNode), childNodes(sb.childNodes), SuffixNode(sb.parentNode, sb.startCharPosition, sb.endCharPosition) {} ~SuffixBranch() {} void print(string, int); // need a special method for printing the node because there are children void eraseChild(char); // need a special method for erasing the children diff --git a/suffixtree.cpp b/suffixtree.cpp index fd18109..9cd8351 100644 --- a/suffixtree.cpp +++ b/suffixtree.cpp @@ -33,25 +33,6 @@ inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is neces return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent } -//******************************************************************************************************************** - -SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode), - nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) { - try { - m = MothurOut::getInstance(); - - for (int i = 0; i < st.nodeVector.size(); i++) { - SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i])); - nodeVector.push_back(temp); - } - - - }catch(exception& e) { - m->errorOut(e, "SuffixTree", "SuffixTree"); - exit(1); - } -} - //******************************************************************************************************************** SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); } diff --git a/suffixtree.hpp b/suffixtree.hpp index 
492db54..d2b69e4 100644 --- a/suffixtree.hpp +++ b/suffixtree.hpp @@ -36,8 +36,6 @@ class SuffixTree { public: SuffixTree(); ~SuffixTree(); -// SuffixTree(string, string); - SuffixTree(const SuffixTree&); void loadSequence(Sequence); string getSeqName(); diff --git a/tree.cpp b/tree.cpp index b993735..d9d71ae 100644 --- a/tree.cpp +++ b/tree.cpp @@ -706,7 +706,7 @@ void Tree::randomLabels(string groupA, string groupB) { exit(1); } } -/**************************************************************************************************/ +**************************************************************************************************/ void Tree::randomBlengths() { try { for(int i=numNodes-1;i>=0;i--){ diff --git a/trimoligos.cpp b/trimoligos.cpp index f0b5a80..245cfdc 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -12,6 +12,28 @@ #include "needlemanoverlap.hpp" +/********************************************************************/ +//strip, pdiffs, bdiffs, primers, barcodes, revPrimers +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + ldiffs = l; + sdiffs = s; + + barcodes = br; + primers = pr; + revPrimer = r; + linker = lk; + spacer = sp; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } +} /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, map pr, map br, vector r){ @@ -69,9 +91,9 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ Alignment* alignment; if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); + map::iterator it; - for(it;it!=barcodes.end();it++){ + for(it=barcodes.begin();it!=barcodes.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -296,9 +318,9 @@ int TrimOligos::stripForward(Sequence& seq, int& 
group){ Alignment* alignment; if (primers.size() > 0) { - map::iterator it=primers.begin(); + map::iterator it; - for(it;it!=primers.end();it++){ + for(it=primers.begin();it!=primers.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -375,7 +397,7 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ } } //*******************************************************************/ -int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ +int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ try { string rawSequence = seq.getUnaligned(); int success = pdiffs + 1; //guilty until proven innocent @@ -390,9 +412,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); + if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); } if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); + if (!keepForward) { qual.trimQScores(oligo.length(), -1); } } success = 0; break; @@ -408,9 +430,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ Alignment* alignment; if (primers.size() > 0) { - map::iterator it=primers.begin(); + map::iterator it; - for(it;it!=primers.end();it++){ + for(it=primers.begin();it!=primers.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -470,9 +492,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers else{ //use the best match group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); + if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); } if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); + if (!keepForward) { qual.trimQScores(minPos, -1); } } 
success = minDiff; } @@ -550,6 +572,129 @@ bool TrimOligos::stripReverse(Sequence& seq){ exit(1); } } +//******************************************************************/ +bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripLinker(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } +} + +//******************************************************************/ +bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripSpacer(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent + + for(int i=0;ierrorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } +} //******************************************************************/ bool TrimOligos::compareDNASeq(string oligo, string seq){ diff --git a/trimoligos.h b/trimoligos.h index 8830dff..e3ea7d5 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -19,25 +19,34 @@ class TrimOligos { public: - TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers + TrimOligos(int,int, map, map, vector); //pdiffs, bdiffs, primers, barcodes, revPrimers + TrimOligos(int,int, int, int, map, map, vector, vector, vector); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer ~TrimOligos(); int stripBarcode(Sequence&, int&); int 
stripBarcode(Sequence&, QualityScores&, int&); int stripForward(Sequence&, int&); - int stripForward(Sequence&, QualityScores&, int&); + int stripForward(Sequence&, QualityScores&, int&, bool); bool stripReverse(Sequence&); bool stripReverse(Sequence&, QualityScores&); + + bool stripLinker(Sequence&); + bool stripLinker(Sequence&, QualityScores&); + + bool stripSpacer(Sequence&); + bool stripSpacer(Sequence&, QualityScores&); private: - int pdiffs, bdiffs; + int pdiffs, bdiffs, ldiffs, sdiffs; map barcodes; map primers; vector revPrimer; + vector linker; + vector spacer; MothurOut* m; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index dd3427b..ae9f707 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -25,9 +25,12 @@ vector TrimSeqsCommand::setParameters(){ CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength); CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); - CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles); + CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward); CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", 
"", "",false,false); parameters.push_back(pqtrim); CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold); CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage); @@ -64,9 +67,11 @@ string TrimSeqsCommand::getHelpString(){ helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; helpString += "The maxlength parameter allows you to set and maximum sequence length. \n"; - helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; helpString += "The qfile parameter allows you to provide a quality file.\n"; helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n"; helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n"; @@ -75,6 +80,7 @@ string TrimSeqsCommand::getHelpString(){ helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. 
\n"; helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n"; helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; + helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n"; helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n"; helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n"; helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n"; @@ -229,11 +235,17 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); - temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } m->mothurConvert(temp, tdiffs); - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } temp = 
validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } @@ -274,6 +286,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; } + keepforward = m->isTrue(temp); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); @@ -545,7 +560,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; - TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); while (moreSeqs) { @@ -578,14 +593,24 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int barcodeIndex = 0; int primerIndex = 0; + if(numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(!success) { trashCode += 'k'; } + } + if(barcodes.size() != 0){ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } + if(numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(!success) { trashCode += 's'; } + } + if(numFPrimers != 0){ - success = trimOligos.stripForward(currSeq, currQual, primerIndex); + success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward); if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -1101,6 +1126,10 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< barcodes[oligo]=indexBarcode; indexBarcode++; barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + 
spacer.push_back(oligo); } else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } @@ -1192,6 +1221,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } numFPrimers = primers.size(); numRPrimers = revPrimer.size(); + numLinkers = linker.size(); + numSpacers = spacer.size(); bool allBlank = true; for (int i = 0; i < barcodeNameVector.size(); i++) { diff --git a/trimseqscommand.h b/trimseqscommand.h index 137cb73..b6050f2 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -52,8 +52,8 @@ private: bool abort, createGroup; string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir; - bool flip, allFiles, qtrim; - int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, comboStarts; + bool flip, allFiles, qtrim, keepforward; + int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; int qWindowSize, qWindowStep, keepFirst, removeLast; double qRollAverage, qThreshold, qWindowAverage, qAverage; vector revPrimer, outputNames; @@ -61,6 +61,8 @@ private: map barcodes; vector groupVector; map primers; + vector linker; + vector spacer; map combos; map groupToIndex; vector primerNameVector; //needed here? -- 2.39.2