From e0dc0bcef2a0f7e1f63abb531dbb1ad533da98ca Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 11 Nov 2011 14:17:31 +0000 Subject: [PATCH] fixed bug with trim.flows that was adding flow files names to the .flow.files file more than once. working on shhh.seqs command, finished chimera.perseus command. Fixed bug with chimera.uchime relating to uchime executable location. added column headings to chimera.uchime output. --- Mothur.xcodeproj/project.pbxproj | 18 + chimeraperseuscommand.cpp | 16 +- chimerauchimecommand.cpp | 43 +- chimerauchimecommand.h | 22 +- commandfactory.cpp | 7 +- distancecommand.h | 2 +- metastatscommand.cpp | 6 +- mothur.h | 1 + myPerseus.cpp | 2 +- myseqdist.cpp | 416 ++++++++++++++ myseqdist.h | 53 ++ seqnoise.cpp | 959 +++++++++++++++++++++++++++++++ seqnoise.h | 71 +++ sharedrabundvector.cpp | 4 +- shhhseqscommand.cpp | 681 ++++++++++++++++++++++ shhhseqscommand.h | 332 +++++++++++ trimflowscommand.cpp | 44 +- 17 files changed, 2623 insertions(+), 54 deletions(-) create mode 100644 myseqdist.cpp create mode 100644 myseqdist.h create mode 100644 seqnoise.cpp create mode 100644 seqnoise.h create mode 100644 shhhseqscommand.cpp create mode 100644 shhhseqscommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index a756931..ff6df12 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -43,6 +43,9 @@ A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; }; A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7730EFE13967241007433A3 /* countseqscommand.cpp */; }; + A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774101314695AF60098E6AC /* shhhseqscommand.cpp */; }; + A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; }; + A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; }; A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; }; A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; }; A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; }; @@ -420,6 +423,12 @@ A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = ""; }; A7730EFD13967241007433A3 /* countseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countseqscommand.h; sourceTree = ""; }; A7730EFE13967241007433A3 /* countseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countseqscommand.cpp; sourceTree = ""; }; + A774101214695AF60098E6AC /* shhhseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shhhseqscommand.h; sourceTree = ""; }; + A774101314695AF60098E6AC /* shhhseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = shhhseqscommand.cpp; sourceTree = ""; }; + A774104614696F320098E6AC /* myseqdist.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myseqdist.cpp; sourceTree = ""; }; + A774104714696F320098E6AC /* myseqdist.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myseqdist.h; sourceTree = ""; }; + A77410F414697C300098E6AC /* seqnoise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqnoise.cpp; sourceTree = ""; }; + A77410F514697C300098E6AC /* seqnoise.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqnoise.h; sourceTree = ""; }; A778FE69134CA6CA00C0BA33 /* getcommandinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcommandinfocommand.h; sourceTree = ""; }; A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = ""; }; A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = ""; }; @@ -1088,6 +1097,8 @@ A7E9B75C12D37EC400DA6239 /* mothur.h */, A7E9B75D12D37EC400DA6239 /* mothurout.cpp */, A7E9B75E12D37EC400DA6239 /* mothurout.h */, + A774104714696F320098E6AC /* myseqdist.h */, + A774104614696F320098E6AC /* myseqdist.cpp */, A7E9B76112D37EC400DA6239 /* nast.cpp */, A7E9B76212D37EC400DA6239 /* nast.hpp */, A7E9B76312D37EC400DA6239 /* nastreport.cpp */, @@ -1112,6 +1123,8 @@ A7E9B7AD12D37EC400DA6239 /* rarefactioncurvedata.h */, 7E6BE10812F710D8007ADDBE /* refchimeratest.h */, 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */, + A77410F514697C300098E6AC /* seqnoise.h */, + A77410F414697C300098E6AC /* seqnoise.cpp */, A7E9BA5312D39A5E00DA6239 /* read */, A7E9B82312D37EC400DA6239 /* sharedutilities.cpp */, A7E9B82412D37EC400DA6239 /* sharedutilities.h */, @@ -1404,6 +1417,8 @@ A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */, A7E9B82812D37EC400DA6239 /* shhhercommand.h */, A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */, + A774101214695AF60098E6AC /* shhhseqscommand.h */, + A774101314695AF60098E6AC /* shhhseqscommand.cpp */, A7E9B84012D37EC400DA6239 /* splitabundcommand.h */, A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */, A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */, @@ -2167,6 +2182,9 @@ A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */, A7BF221414587886000AD524 /* myPerseus.cpp in Sources */, A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */, + A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */, + A774104814696F320098E6AC /* myseqdist.cpp in Sources */, + A77410F614697C300098E6AC /* seqnoise.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraperseuscommand.cpp b/chimeraperseuscommand.cpp index 240ba65..61970d5 100644 --- a/chimeraperseuscommand.cpp +++ b/chimeraperseuscommand.cpp @@ -39,7 +39,7 @@ string ChimeraPerseusCommand::getHelpString(){ helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n"; helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; - helpString += "The name parameter allows you to provide a name file associated with your fasta file. If none is given unique.seqs will be run to generate one. \n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; @@ -640,7 +640,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque vector chimeras(numSeqs, 0); for(int i=0;icontrol_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } vector restricted = chimeras; @@ -657,7 +656,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque vector alignments(numSeqs); int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted); - cout << "comparisons = " << comparisons << endl; if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi; @@ -666,7 +664,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque if(comparisons >= 2){ minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted); - cout << "minMismatchToChimera = " << minMismatchToChimera << endl; if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } int minMismatchToTrimera = numeric_limits::max(); @@ -674,12 +671,11 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque if(minMismatchToChimera >= 3 && comparisons >= 3){ minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted); - cout << "minMismatchToTrimera = " << minMismatchToTrimera << endl; if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } } double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel); - cout << "singleDist = " << singleDist << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } string type; @@ -693,16 +689,16 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector& seque type = "chimera"; chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps); } - cout << "chimeraRefSeq = " << chimeraRefSeq << endl; + ; if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel); - cout << "chimeraDist = " << chimeraDist << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq); double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix); - cout << "loonIndex = " << loonIndex << endl; + if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; } chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t'; @@ -841,7 +837,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string //Wait until all threads have terminated. WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); - cout << "here" << endl; + //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index ee7add9..54b1d9b 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -560,12 +560,19 @@ int ChimeraUchimeCommand::execute(){ int numSeqs = 0; int numChimeras = 0; - //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); } else{ numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); } - //#else - // numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); - //#endif + + //add headings + ofstream out; + m->openOutputFile(outputFileName+".temp", out); + out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n"; + out.close(); + + m->appendFiles(outputFileName, outputFileName+".temp"); + m->mothurRemove(outputFileName); rename((outputFileName+".temp").c_str(), outputFileName.c_str()); + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } //remove file made for uchime @@ -653,6 +660,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp ofstream out; m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n"; float temp1; string parent1, parent2, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, flag; @@ -985,17 +993,24 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc alns = "\"" + alns + "\""; vector cPara; - - char* tempUchime; + + string path = m->argv; + string tempPath = path; + for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); } + path = path.substr(0, (tempPath.find_last_of('m'))); + + string uchimeCommand = path; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - tempUchime= new char[10]; - *tempUchime = '\0'; - strncat(tempUchime, "./uchime ", 9); + uchimeCommand += "uchime "; #else - tempUchime= new char[8]; - *tempUchime = '\0'; - strncat(tempUchime, "uchime ", 7); + uchimeCommand += "uchime"; + uchimeCommand = "\"" + uchimeCommand + "\""; #endif + + char* tempUchime; + tempUchime= new char[uchimeCommand.length()+1]; + *tempUchime = '\0'; + strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length()); cPara.push_back(tempUchime); char* tempIn = new char[8]; @@ -1223,6 +1238,10 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc //uchime_main(numArgs, uchimeParameters); //cout << "commandString = " << commandString << endl; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else + commandString = "\"" + commandString + "\""; +#endif system(commandString.c_str()); //free memory diff --git a/chimerauchimecommand.h b/chimerauchimecommand.h index 359b68c..6e1f809 100644 --- a/chimerauchimecommand.h +++ b/chimerauchimecommand.h @@ -48,6 +48,7 @@ private: string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract; int processors; + vector outputNames; vector fastaFileNames; vector nameFileNames; @@ -178,10 +179,23 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ vector cPara; + string path = pDataArray->m->argv; + string tempPath = path; + for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); } + path = path.substr(0, (tempPath.find_last_of('m'))); + + string uchimeCommand = path; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + uchimeCommand += "uchime "; +#else + uchimeCommand += "uchime"; + uchimeCommand = "\"" + uchimeCommand + "\""; +#endif + char* tempUchime; - tempUchime= new char[8]; + tempUchime= new char[uchimeCommand.length()+1]; *tempUchime = '\0'; - strncat(tempUchime, "uchime ", 7); + strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length()); cPara.push_back(tempUchime); char* tempIn = new char[8]; @@ -396,6 +410,10 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){ //uchime_main(numArgs, uchimeParameters); //cout << "commandString = " << commandString << endl; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else + commandString = "\"" + commandString + "\""; +#endif system(commandString.c_str()); //free memory diff --git a/commandfactory.cpp b/commandfactory.cpp index 69bb568..364c5ed 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -124,6 +124,7 @@ #include "clearmemorycommand.h" #include "summarytaxcommand.h" #include "chimeraperseuscommand.h" +#include "shhhseqscommand.h" /*******************************************************/ @@ -268,7 +269,8 @@ CommandFactory::CommandFactory(){ commands["shhh.flows"] = "MPIEnabled"; commands["sens.spec"] = "sens.spec"; commands["seq.error"] = "seq.error"; - commands["seq.error"] = "summary.tax"; + commands["summary.tax"] = "summary.tax"; + commands["shhh.seqs"] = "shhh.seqs"; commands["quit"] = "MPIEnabled"; @@ -428,6 +430,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } else if(commandName == "chimera.perseus") { command = new ChimeraPerseusCommand(optionString); } + else if(commandName == "shhh.seqs") { command = new ShhhSeqsCommand(optionString); } else { command = new NoCommand(optionString); } return command; @@ -570,6 +573,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } else if(commandName == "chimera.perseus") { pipecommand = new ChimeraPerseusCommand(optionString); } + else if(commandName == "shhh.seqs") { pipecommand = new ShhhSeqsCommand(optionString); } else { pipecommand = new NoCommand(optionString); } return pipecommand; @@ -700,6 +704,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } else if(commandName == "chimera.perseus") { shellcommand = new ChimeraPerseusCommand(); } + else if(commandName == "shhh.seqs") { shellcommand = new ShhhSeqsCommand(); } else { shellcommand = new NoCommand(); } return shellcommand; diff --git a/distancecommand.h b/distancecommand.h index 66d8af1..f55f714 100644 --- a/distancecommand.h +++ b/distancecommand.h @@ -24,7 +24,7 @@ //custom data structure for threads to use. // This is passed by void pointer so it can be any data type // that can be passed using a single void pointer (LPVOID). -typedef struct distanceData { +struct distanceData { int startLine; int endLine; string dFileName; diff --git a/metastatscommand.cpp b/metastatscommand.cpp index 975924b..0ff8b3f 100644 --- a/metastatscommand.cpp +++ b/metastatscommand.cpp @@ -394,7 +394,7 @@ int MetaStatsCommand::driver(int start, int num, vector& th //get set names string setA = namesOfGroupCombos[c][0]; string setB = namesOfGroupCombos[c][1]; - cout << setA << '\t' << setB << endl; + //cout << setA << '\t' << setB << endl; //get filename string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats"; outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName); @@ -425,8 +425,8 @@ int MetaStatsCommand::driver(int start, int num, vector& th } } - for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; } - cout << setACount << endl; + //for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; } + //cout << setACount << endl; if ((setACount == 0) || (setBCount == 0)) { m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine(); diff --git a/mothur.h b/mothur.h index f8ec323..57ece67 100644 --- a/mothur.h +++ b/mothur.h @@ -85,6 +85,7 @@ using namespace std; #define isnan(x) ((x) != (x)) #define isinf(x) (fabs(x) == std::numeric_limits::infinity()) + typedef unsigned long ull; struct IntNode { diff --git a/myPerseus.cpp b/myPerseus.cpp index b342393..3cf38cd 100644 --- a/myPerseus.cpp +++ b/myPerseus.cpp @@ -548,7 +548,7 @@ int Perseus::getChimera(vector sequences, rightParent = -1; breakPoint = -1; - for(int l=0;lcontrol_pressed) { return 0; } diff --git a/myseqdist.cpp b/myseqdist.cpp new file mode 100644 index 0000000..f5c8b40 --- /dev/null +++ b/myseqdist.cpp @@ -0,0 +1,416 @@ +/* + * pds.seqdist.cpp + * + * + * Created by Pat Schloss on 8/12/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "myseqdist.h" +#include "sequence.hpp" + +/**************************************************************************************************/ +correctDist::correctDist(int p) : processors(p) { + try { + m = MothurOut::getInstance(); + } + catch(exception& e) { + m->errorOut(e, "correctDist", "correctDist"); + exit(1); + } +} +/**************************************************************************************************/ +correctDist::correctDist(string sequenceFileName, int p) : processors(p) { + try { + m = MothurOut::getInstance(); + getSequences(sequenceFileName); + } + catch(exception& e) { + m->errorOut(e, "correctDist", "correctDist"); + exit(1); + } +} +/**************************************************************************************************/ +int correctDist::addSeq(string seqName, string seqSeq){ + try { + names.push_back(seqName); + sequences.push_back(fixSequence(seqSeq)); + return 0; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "addSeq"); + exit(1); + } +} +/**************************************************************************************************/ +int correctDist::execute(string distanceFileName){ + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else + processors = 1; +#endif + correctMatrix.resize(4); + for(int i=0;i<4;i++){ correctMatrix[i].resize(4); } + + correctMatrix[0][0] = 0.000000; //AA + correctMatrix[1][0] = 11.619259; //CA + correctMatrix[2][0] = 11.694004; //TA + correctMatrix[3][0] = 7.748623; //GA + + correctMatrix[1][1] = 0.000000; //CC + correctMatrix[2][1] = 7.619657; //TC + correctMatrix[3][1] = 12.852562; //GC + + correctMatrix[2][2] = 0.000000; //TT + correctMatrix[3][2] = 10.964048; //TG + + correctMatrix[3][3] = 0.000000; //GG + + for(int i=0;i<4;i++){ + for(int j=0;jerrorOut(e, "correctDist", "execute"); + exit(1); + } +} +/**************************************************************************************************/ +int correctDist::getSequences(string sequenceFileName){ + try { + ifstream sequenceFile; + m->openInputFile(sequenceFileName, sequenceFile); + string seqName, seqSeq; + + while(!sequenceFile.eof()){ + if (m->control_pressed) { break; } + + Sequence temp(sequenceFile); m->gobble(sequenceFile); + + if (temp.getName() != "") { + names.push_back(temp.getName()); + sequences.push_back(fixSequence(temp.getAligned())); + } + } + sequenceFile.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "getSequences"); + exit(1); + } +} + +/**************************************************************************************************/ +vector correctDist::fixSequence(string sequence){ + try { + int alignLength = sequence.length(); + + vector seqVector; + + for(int i=0;ierrorOut(e, "correctDist", "fixSequence"); + exit(1); + } +} + +/**************************************************************************************************/ + +int correctDist::createProcess(string distanceFileName){ + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 1; + vector processIDs; + + while(process != processors){ + + int pid = fork(); + + if(pid > 0){ + processIDs.push_back(pid); + process++; + } + else if(pid == 0){ + driver(start[process], end[process], distanceFileName + toString(getpid()) + ".temp"); + exit(0); + } + else{ + cout << "Doh!" << endl; + for (int i=0;iappendFiles((distanceFileName + toString(processIDs[i]) + ".temp"), distanceFileName); + remove((distanceFileName + toString(processIDs[i]) + ".temp").c_str()); + } +#endif + return 0; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "createProcess"); + exit(1); + } +} + +/**************************************************************************************************/ + +int correctDist::driver(int start, int end, string distFileName){ + try { + ofstream distFile; + m->openOutputFile(distFileName, distFile); + distFile << setprecision(9); + + if(start == 0){ + distFile << numSeqs << endl; + } + + int startTime = time(NULL); + + m->mothurOut("\nCalculating distances...\n"); + + for(int i=start;icontrol_pressed) { distFile.close(); return 0; } + + double dist = getDist(sequences[i], sequences[j]); + + distFile << ' ' << dist; + } + distFile << endl; + + if(i % 100 == 0){ m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); } + } + distFile.close(); + + if((end-1) % 100 != 0){ m->mothurOut(toString(end-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); } + m->mothurOut("Done.\n"); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "driver"); + exit(1); + } +} +/**************************************************************************************************/ +double correctDist::getDist(vector& seqA, vector& seqB){ + try { + + int lengthA = seqA.size(); + int lengthB = seqB.size(); + + vector > alignMatrix(lengthA+1); + vector > alignMoves(lengthA+1); + + for(int i=0;i<=lengthA;i++){ + alignMatrix[i].resize(lengthB+1, 0); + alignMoves[i].resize(lengthB+1, 'x'); + } + + for(int i=0;i<=lengthA;i++){ + alignMatrix[i][0] = 15.0 * i; + alignMoves[i][0] = 'u'; + } + for(int i=0;i<=lengthB;i++){ + alignMatrix[0][i] = 15.0 * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=lengthA;i++){ + for(int j=1;j<=lengthB;j++){ + + if (m->control_pressed) { return 0; } + + double nogap; + nogap = alignMatrix[i-1][j-1] + correctMatrix[seqA[i-1]][seqB[j-1]]; + + + double gap; + double left; + if(i == lengthA){ //terminal gap + left = alignMatrix[i][j-1]; + } + else{ + if(seqB[j-1] == getLastMatch('l', alignMoves, i, j, seqA, seqB)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + left = alignMatrix[i][j-1] + gap; + } + + + double up; + if(j == lengthB){ //terminal gap + up = alignMatrix[i-1][j]; + } + else{ + + if(seqA[i-1] == getLastMatch('u', alignMoves, i, j, seqA, seqB)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + up = alignMatrix[i-1][j] + gap; + } + + + + if(nogap < left){ + if(nogap < up){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogap; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + else{ + if(left < up){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = left; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + } + } + + int i = lengthA; + int j = lengthB; + int count = 0; + + + // string alignA = ""; + // string alignB = ""; + // string bases = "ACTG"; + // + // for(int i=0;i 0 && j > 0){ + + if (m->control_pressed) { return 0; } + + if(alignMoves[i][j] == 'd'){ + // alignA = bases[seqA[i-1]] + alignA; + // alignB = bases[seqB[j-1]] + alignB; + + count++; + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + if(j != lengthB){ + // alignA = bases[seqA[i-1]] + alignA; + // alignB = '-' + alignB; + count++; + } + + i--; + } + else if(alignMoves[i][j] == 'l'){ + if(i != lengthA){ + // alignA = '-' + alignA; + // alignB = bases[seqB[j-1]] + alignB; + count++; + } + + j--; + } + } + + // cout << alignA << endl << alignB << endl; + + return alignMatrix[lengthA][lengthB] / (double)count; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "getDist"); + exit(1); + } +} +/**************************************************************************************************/ +int correctDist::getLastMatch(char direction, vector >& alignMoves, int i, int j, vector& seqA, vector& seqB){ + try { + + char nullReturn = -1; + + while(i>=1 && j>=1){ + + if (m->control_pressed) { return nullReturn; } + + if(direction == 'd'){ + if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; } + else { return nullReturn; } + } + + else if(direction == 'l') { j--; } + else { i--; } + + direction = alignMoves[i][j]; + } + + return nullReturn; + } + catch(exception& e) { + m->errorOut(e, "correctDist", "getLastMatch"); + exit(1); + } +} +/**************************************************************************************************/ + + + diff --git a/myseqdist.h b/myseqdist.h new file mode 100644 index 0000000..74c9ea7 --- /dev/null +++ b/myseqdist.h @@ -0,0 +1,53 @@ +#ifndef CORRECTDIST_H +#define CORRECTDIST_H + + +/* + * pds.seqdist.h + * + * + * Created by Pat Schloss on 8/12/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothurout.h" + +/**************************************************************************************************/ + +class correctDist { +public: + correctDist(string, int); + correctDist(int); + ~correctDist(){} + + int addSeq(string, string); + int execute(string); + +private: + MothurOut* m; + int getSequences(string); + vector fixSequence(string); + + int driver(int, int, string); + int createProcess(string); + + double getDist(vector&, vector&); + int getLastMatch(char, vector >&, int, int, vector&, vector&); + + vector > correctMatrix; + + vector > sequences; + + vector names; + int numSeqs; + int processors; + vector start; + vector end; +}; + +/**************************************************************************************************/ + +#endif + + diff --git a/seqnoise.cpp b/seqnoise.cpp new file mode 100644 index 0000000..578daaf --- /dev/null +++ b/seqnoise.cpp @@ -0,0 +1,959 @@ +/* + * mySeqNoise.cpp + * + * + * Created by Pat Schloss on 8/31/11. + * Copyright 2011 Patrick D. Schloss. All rights reserved. + * + */ + +#include "seqnoise.h" +#include "sequence.hpp" + +#define MIN_DELTA 1.0e-6 +#define MIN_ITER 20 +#define MAX_ITER 1000 +#define MIN_COUNT 0.1 +#define MIN_TAU 1.0e-4 +#define MIN_WEIGHT 0.1 + + +/**************************************************************************************************/ +int seqNoise::getSequenceData(string sequenceFileName, vector& sequences){ + try { + + ifstream sequenceFile; + m->openInputFile(sequenceFileName, sequenceFile); + + while(!sequenceFile.eof()){ + + if (m->control_pressed) { break; } + + Sequence temp(sequenceFile); m->gobble(sequenceFile); + + if (temp.getName() != "") { + sequences.push_back(temp.getAligned()); + } + } + sequenceFile.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "getSequenceData"); + exit(1); + } +} +/**************************************************************************************************/ +int seqNoise::addSeq(string seq, vector& sequences){ + try { + sequences.push_back(seq); + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "addSeq"); + exit(1); + } +} +/**************************************************************************************************/ +//no checks for file mismatches +int seqNoise::getRedundantNames(string namesFileName, vector& uniqueNames, vector& redundantNames, vector& seqFreq){ + try { + string unique, redundant; + ifstream namesFile; + m->openInputFile(namesFileName, namesFile); + + for(int i=0;icontrol_pressed) { break; } + + namesFile >> uniqueNames[i]; m->gobble(namesFile); + namesFile >> redundantNames[i]; m->gobble(namesFile); + + seqFreq[i] = m->getNumNames(redundantNames[i]); + } + namesFile.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "getRedundantNames"); + exit(1); + } +} +/**************************************************************************************************/ +int seqNoise::addRedundantName(string uniqueName, string redundantName, vector& uniqueNames, vector& redundantNames, vector& seqFreq){ + try { + + uniqueNames.push_back(uniqueName); + redundantNames.push_back(redundantName); + seqFreq.push_back(m->getNumNames(redundantName)); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "addRedundantName"); + exit(1); + } +} +/**************************************************************************************************/ +int seqNoise::getDistanceData(string distFileName, vector& distances){ + try { + + ifstream distFile; + m->openInputFile(distFileName, distFile); + + int numSeqs = 0; + string name = ""; + + distFile >> numSeqs; + + for(int i=0;icontrol_pressed) { break; } + + distances[i * numSeqs + i] = 0.0000; + + distFile >> name; + + for(int j=0;j> distances[i * numSeqs + j]; + distances[j * numSeqs + i] = distances[i * numSeqs + j]; + } + } + + distFile.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "getDistanceData"); + exit(1); + } +} + +/**************************************************************************************************/ +int seqNoise::getListData(string listFileName, double cutOff, vector& otuData, vector& otuFreq, vector >& otuBySeqLookUp){ + try { + + ifstream listFile; + m->openInputFile(listFileName, listFile); + double threshold; + int numOTUs; + + if(listFile.peek() == 'u'){ m->getline(listFile); } + + while(listFile){ + listFile >> threshold; + + if(threshold < cutOff){ + m->getline(listFile); + } + else{ + listFile >> numOTUs; + otuFreq.resize(numOTUs, 0); + + for(int i=0;icontrol_pressed) { return 0; } + + string otu; + listFile >> otu; + + int count = 0; + + string number = ""; + + for(int j=0;jcontrol_pressed) { return 0; } + otuBySeqLookUp[otuData[i]].push_back(i); + } + for(int i=0;icontrol_pressed) { return 0; } + for(int j=otuBySeqLookUp[i].size();jerrorOut(e, "seqNoise", "getListData"); + exit(1); + } +} + +/**************************************************************************************************/ +int seqNoise::updateOTUCountData(vector otuFreq, + vector > otuBySeqLookUp, + vector > aanI, + vector& anP, + vector& anI, + vector& cumCount + ){ + try { + int numOTUs = otuFreq.size(); + + int count = 0; + + for(int i=0;icontrol_pressed) { return 0; } + + for(int j=0;jerrorOut(e, "seqNoise", "updateOTUCountData"); + exit(1); + } +} +/**************************************************************************************************/ +double seqNoise::calcNewWeights( + vector& weights, // + vector seqFreq, // + vector anI, // + vector cumCount, // + vector anP, // + vector otuFreq, // + vector tau // + ){ + try { + + int numOTUs = weights.size(); + double maxChange = -1; + + cout.flush(); + + for(int i=0;icontrol_pressed) { return 0; } + + double change = weights[i]; + + weights[i] = 0.0000; + + for(int j=0;j maxChange){ maxChange = change; } + cout.flush(); + } + return maxChange; + + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "calcNewWeights"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::calcCentroids( + vector anI, + vector anP, + vector& change, + vector& centroids, + vector cumCount, + vector distances,/// + vector seqFreq, + vector otuFreq, + vector tau + ){ + try { + int numOTUs = change.size(); + int numSeqs = seqFreq.size(); + + for(int i=0;icontrol_pressed) { return 0; } + + int minFIndex = -1; + double minFValue = 1e10; + + change[i] = 0; + double count = 0.00000; + + int freqOfOTU = otuFreq[i]; + + for(int j=0;j 0 && count > MIN_COUNT){ + + vector adF(freqOfOTU); + vector anL(freqOfOTU); + + for(int j=0;jerrorOut(e, "seqNoise", "calcCentroids"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::checkCentroids(vector& weights, vector centroids){ + try { + int numOTUs = centroids.size(); + vector unique(numOTUs, 1); + + double minWeight = MIN_WEIGHT; + for(int i=0;icontrol_pressed) { return 0; } + if(weights[i] < minWeight){ unique[i] = -1; } + } + + for(int i=0;icontrol_pressed) { return 0; } + if(unique[i] == 1){ + for(int j=i+1; jerrorOut(e, "seqNoise", "checkCentroids"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::setUpOTUData(vector& otuData, vector& percentage, vector cumCount, vector tau, vector otuFreq, vector anP, vector anI){ + try { + + int numOTUs = cumCount.size(); + int numSeqs = otuData.size(); + + vector bestTau(numSeqs, 0); + vector bestIndex(numSeqs, -1); + + for(int i=0;icontrol_pressed) { return 0; } + for(int j=0;j bestTau[index2]){ + bestTau[index2] = thisTau; + bestIndex[index2] = i; + } + } + } + + for(int i=0;icontrol_pressed) { return 0; } + otuData[i] = bestIndex[i]; + percentage[i] = 1 - bestTau[i]; + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "setUpOTUData"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::finishOTUData(vector otuData, vector& otuFreq, vector& anP, vector& anI, vector& cumCount, vector >& otuBySeqLookUp, vector >& aanI, vector& tau){ + try { + int numSeqs = otuData.size(); + int numOTUs = otuFreq.size(); + int total = numSeqs; + + otuFreq.assign(numOTUs, 0); + tau.assign(numSeqs, 1); + anP.assign(numSeqs, 0); + anI.assign(numSeqs, 0); + + for(int i=0;icontrol_pressed) { return 0; } + int otu = otuData[i]; + total++; + + otuBySeqLookUp[otu][otuFreq[otu]] = i; + aanI[otu][otuFreq[otu]] = i; + otuFreq[otu]++; + } + updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "finishOTUData"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::getLastMatch(char direction, vector >& alignMoves, int i, int j, vector& seqA, vector& seqB){ + try{ + char nullReturn = -1; + + while(i>=1 && j>=1){ + if (m->control_pressed) { return nullReturn; } + if(direction == 'd'){ + if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; } + else { return nullReturn; } + } + + else if(direction == 'l') { j--; } + else { i--; } + + direction = alignMoves[i][j]; + } + + return nullReturn; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "getLastMatch"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::countDiffs(vector query, vector ref){ + try { + //double MATCH = 5.0; + //double MISMATCH = -2.0; + //double GAP = -2.0; + + vector > correctMatrix(4); + for(int i=0;i<4;i++){ correctMatrix[i].resize(4); } + + correctMatrix[0][0] = 0.000000; //AA + correctMatrix[1][0] = 11.619259; //CA + correctMatrix[2][0] = 11.694004; //TA + correctMatrix[3][0] = 7.748623; //GA + + correctMatrix[1][1] = 0.000000; //CC + correctMatrix[2][1] = 7.619657; //TC + correctMatrix[3][1] = 12.852562; //GC + + correctMatrix[2][2] = 0.000000; //TT + correctMatrix[3][2] = 10.964048; //TG + + correctMatrix[3][3] = 0.000000; //GG + + for(int i=0;i<4;i++){ + for(int j=0;j > alignMatrix(queryLength + 1); + vector > alignMoves(queryLength + 1); + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i].resize(refLength + 1, 0); + alignMoves[i].resize(refLength + 1, 'x'); + } + + for(int i=0;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[i][0] = 15.0 * i; + alignMoves[i][0] = 'u'; + } + + for(int i=0;i<=refLength;i++){ + if (m->control_pressed) { return 0; } + alignMatrix[0][i] = 15.0 * i; + alignMoves[0][i] = 'l'; + } + + for(int i=1;i<=queryLength;i++){ + if (m->control_pressed) { return 0; } + for(int j=1;j<=refLength;j++){ + + double nogap; + nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]]; + + + double gap; + double left; + if(i == queryLength){ //terminal gap + left = alignMatrix[i][j-1]; + } + else{ + if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + left = alignMatrix[i][j-1] + gap; + } + + + double up; + if(j == refLength){ //terminal gap + up = alignMatrix[i-1][j]; + } + else{ + + if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){ + gap = 4.0; + } + else{ + gap = 15.0; + } + + up = alignMatrix[i-1][j] + gap; + } + + + + if(nogap < left){ + if(nogap < up){ + alignMoves[i][j] = 'd'; + alignMatrix[i][j] = nogap; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + else{ + if(left < up){ + alignMoves[i][j] = 'l'; + alignMatrix[i][j] = left; + } + else{ + alignMoves[i][j] = 'u'; + alignMatrix[i][j] = up; + } + } + } + } + + int i = queryLength; + int j = refLength; + int diffs = 0; + + // string alignA = ""; + // string alignB = ""; + // string bases = "ACTG"; + + while(i > 0 && j > 0){ + if (m->control_pressed) { return 0; } + if(alignMoves[i][j] == 'd'){ + // alignA = bases[query[i-1]] + alignA; + // alignB = bases[ref[j-1]] + alignB; + + if(query[i-1] != ref[j-1]) { diffs++; } + + i--; + j--; + } + else if(alignMoves[i][j] == 'u'){ + if(j != refLength){ + // alignA = bases[query[i-1]] + alignA; + // alignB = '-' + alignB; + + diffs++; + } + + i--; + } + else if(alignMoves[i][j] == 'l'){ + if(i != queryLength){ + // alignA = '-' + alignA; + // alignB = bases[ref[j-1]] + alignB; + + diffs++; + } + + j--; + } + } + + // cout << diffs << endl; + // cout << alignA << endl; + // cout << alignB << endl; + // cout << endl; + + return diffs; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "countDiffs"); + exit(1); + } + +} + +/**************************************************************************************************/ + +vector seqNoise::convertSeq(string bases){ + try { + vector numbers(bases.length(), -1); + + for(int i=0;icontrol_pressed) { return numbers; } + + char b = bases[i]; + + if(b == 'A') { numbers[i] = 0; } + else if(b=='C') { numbers[i] = 1; } + else if(b=='T') { numbers[i] = 2; } + else if(b=='G') { numbers[i] = 3; } + else { numbers[i] = 0; } + } + + return numbers; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "convertSeq"); + exit(1); + } +} + +/**************************************************************************************************/ + +string seqNoise::degapSeq(string aligned){ + try { + string unaligned = ""; + + for(int i=0;icontrol_pressed) { return ""; } + + if(aligned[i] != '-' && aligned[i] != '.'){ + unaligned += aligned[i]; + } + } + + return unaligned; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "degapSeq"); + exit(1); + } +} + +/**************************************************************************************************/ + +int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector finalTau, vector centroids, vector otuData, vector sequences, vector uniqueNames, vector redundantNames, vector seqFreq, vector& distances){ + try { + int numOTUs = finalTau.size(); + int numSeqs = uniqueNames.size(); + + ofstream fastaFile(fastaFileName.c_str()); + ofstream namesFile(namesFileName.c_str()); + ofstream uMapFile(uMapFileName.c_str()); + + vector maxSequenceAbund(numOTUs, 0); + vector maxSequenceIndex(numOTUs, 0); + + for(int i=0;icontrol_pressed) { return 0; } + if(maxSequenceAbund[otuData[i]] < seqFreq[i]){ + maxSequenceAbund[otuData[i]] = seqFreq[i]; + maxSequenceIndex[otuData[i]] = i; + } + } + + int count = 1; + + for(int i=0;icontrol_pressed) { return 0; } + + if(finalTau[i] > 0){ + + if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){ + // cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl; + centroids[i] = maxSequenceIndex[i]; + } + + int index = centroids[i]; + + fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl; + namesFile << uniqueNames[index] << '\t'; + + string refSeq = sequences[index]; + string redundantSeqs = redundantNames[index];; + + + vector frequencyData; + + for(int j=0;j rUnalign = convertSeq(refDegap); + + uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl; + uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl; + + + for(int j=0;jcontrol_pressed) { return 0; } + redundantSeqs += ',' + redundantNames[frequencyData[j].index]; + + uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t'; + + string querySeq = sequences[frequencyData[j].index]; + + string queryDegap = degapSeq(querySeq); + vector qUnalign = convertSeq(queryDegap); + + int udiffs = countDiffs(qUnalign, rUnalign); + uMapFile << udiffs << '\t' << queryDegap << endl; + + } + + uMapFile << endl; + namesFile << redundantSeqs << endl; + count++; + + } + } + fastaFile.close(); + namesFile.close(); + uMapFile.close(); + return 0; + } + catch(exception& e) { + m->errorOut(e, "seqNoise", "writeOutput"); + exit(1); + } +} + +/************************************************************************************************** + +int main(int argc, char *argv[]){ + + double sigma = 100; + sigma = atof(argv[5]); + + double cutOff = 0.08; + int minIter = 10; + int maxIter = 1000; + double minDelta = 1e-6; + + string sequenceFileName = argv[1]; + string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh"; + + vector sequences; + getSequenceData(sequenceFileName, sequences); + + int numSeqs = sequences.size(); + + vector uniqueNames(numSeqs); + vector redundantNames(numSeqs); + vector seqFreq(numSeqs); + + string namesFileName = argv[4]; + getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq); + + string distFileName = argv[2]; + vector distances(numSeqs * numSeqs); + getDistanceData(distFileName, distances); + + string listFileName = argv[3]; + vector otuData(numSeqs); + vector otuFreq; + vector > otuBySeqLookUp; + + getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp); + + int numOTUs = otuFreq.size(); + + vector weights(numOTUs, 0); + vector change(numOTUs, 1); + vector centroids(numOTUs, -1); + vector cumCount(numOTUs, 0); + + vector tau(numSeqs, 1); + vector anP(numSeqs, 0); + vector anI(numSeqs, 0); + vector anN(numSeqs, 0); + vector > aanI = otuBySeqLookUp; + + int numIters = 0; + double maxDelta = 1e6; + + while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){ + + updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); + maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); + + calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); + checkCentroids(weights, centroids); + + otuFreq.assign(numOTUs, 0); + + int total = 0; + + for(int i=0;i currentTau(numOTUs); + + for(int j=0;j minWeight && distances[i * numSeqs+centroids[j]] < offset){ + offset = distances[i * numSeqs+centroids[j]]; + } + } + + for(int j=0;j minWeight){ + currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j]; + norm += currentTau[j]; + } + else{ + currentTau[j] = 0.0000; + } + } + + for(int j=0;j MIN_TAU){ + int oldTotal = total; + total++; + + tau.resize(oldTotal+1); + tau[oldTotal] = currentTau[j]; + otuBySeqLookUp[j][otuFreq[j]] = oldTotal; + aanI[j][otuFreq[j]] = i; + otuFreq[j]++; + + } + } + + anP.resize(total); + anI.resize(total); + } + + numIters++; + } + + updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); + + vector percentage(numSeqs); + setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); + finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); + + change.assign(numOTUs, 1); + calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); + + + vector finalTau(numOTUs, 0); + for(int i=0;i&); + int addSeq(string, vector&); + int getRedundantNames(string, vector&, vector&, vector&); + int addRedundantName(string, string, vector&, vector&, vector&); + int getDistanceData(string, vector&); + int getListData(string, double, vector&, vector&, vector >&); + int updateOTUCountData(vector, vector >, vector >, vector&, vector&, vector&); + double calcNewWeights(vector&,vector,vector,vector,vector,vector,vector); + int calcCentroids(vector,vector,vector&,vector&,vector,vector,vector,vector,vector); + int checkCentroids(vector&, vector); + int setUpOTUData(vector&, vector&, vector, vector, vector, vector, vector); + int finishOTUData(vector, vector&, vector&, vector&, vector&, vector >&, vector >&, vector&); + int writeOutput(string, string, string, vector, vector, vector, vector, vector, vector, vector, vector&); + + +private: + MothurOut* m; + + int getLastMatch(char, vector >&, int, int, vector&, vector&); + int countDiffs(vector, vector); + vector convertSeq(string); + string degapSeq(string); + +}; + +/**************************************************************************************************/ +#endif + diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp index f42d24b..4a0e5e8 100644 --- a/sharedrabundvector.cpp +++ b/sharedrabundvector.cpp @@ -72,7 +72,7 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), //are we at the beginning of the file?? if (m->saveNextLabel == "") { f >> label; - + //is this a shared file that has headers if (label == "label") { //gets "group" @@ -104,7 +104,6 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), //read in first row since you know there is at least 1 group. f >> groupN >> num; - holdLabel = label; //add new vector to lookup @@ -155,7 +154,6 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), } m->saveNextLabel = nextLabel; m->setAllGroups(allGroups); - } catch(exception& e) { m->errorOut(e, "SharedRAbundVector", "SharedRAbundVector"); diff --git a/shhhseqscommand.cpp b/shhhseqscommand.cpp new file mode 100644 index 0000000..625b939 --- /dev/null +++ b/shhhseqscommand.cpp @@ -0,0 +1,681 @@ +/* + * shhhseqscommand.cpp + * Mothur + * + * Created by westcott on 11/8/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "shhhseqscommand.h" + + + +//********************************************************************************************************************** +vector ShhhSeqsCommand::setParameters(){ + try { + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); + CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter psigma("sigma", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(psigma); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string ShhhSeqsCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The shhh.seqs command reads a fasta and name file and ....\n"; + helpString += "The shhh.seqs command parameters are fasta, name, group, sigma and processors.\n"; + helpString += "The fasta parameter allows you to enter the fasta file containing your potentially sequences, and is required, unless you have a valid current fasta file. \n"; + helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n"; + helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n"; + helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"; + helpString += "The sigma parameter .... The default is 0.01. \n"; + helpString += "The shhh.seqs command should be in the following format: \n"; + helpString += "shhh.seqs(fasta=yourFastaFile, name=yourNameFile) \n"; + helpString += "Example: shhh.seqs(fasta=AD.align, name=AD.names) \n"; + helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n"; + return helpString; + + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** + +ShhhSeqsCommand::ShhhSeqsCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["map"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand"); + exit(1); + } +} + +//********************************************************************************************************************** +ShhhSeqsCommand::ShhhSeqsCommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + + //check to make sure all parameters are valid for command + for (map::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { + if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["name"] = tempOutNames; + outputTypes["map"] = tempOutNames; + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("fasta"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fasta"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } + } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not found") { + fastafile = m->getFastaFile(); + if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } + } + else if (fastafile == "not open") { abort = true; } + else { m->setFastaFile(fastafile); } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not found") { + namefile = m->getNameFile(); + if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; } + } + else if (namefile == "not open") { namefile = ""; abort = true; } + else { m->setNameFile(namefile); } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not found") { groupfile = ""; } + else if (groupfile == "not open") { abort = true; groupfile = ""; } + else { m->setGroupFile(groupfile); } + + string temp = validParameter.validFile(parameters, "sigma", false); if(temp == "not found"){ temp = "0.01"; } + convert(temp, sigma); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + convert(temp, processors); + } + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int ShhhSeqsCommand::execute() { + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + if (outputDir == "") { outputDir = m->hasPath(fastafile); }//if user entered a file with a path then preserve it + string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.fasta"; + string nameFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.names"; + string mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.map"; + + if (groupfile != "") { + //Parse sequences by group + SequenceParser parser(groupfile, fastafile, namefile); + vector groups = parser.getNamesOfGroups(); + + if (m->control_pressed) { return 0; } + + //clears files + ofstream out, out1, out2; + m->openOutputFile(outputFileName, out); out.close(); + m->openOutputFile(nameFileName, out1); out1.close(); + mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh."; + + if(processors == 1) { driverGroups(parser, outputFileName, nameFileName, mapFileName, 0, groups.size(), groups); } + else { createProcessesGroups(parser, outputFileName, nameFileName, mapFileName, groups); } + + if (m->control_pressed) { return 0; } + + //deconvolute results by running unique.seqs + + + if (m->control_pressed) { return 0; } + + }else{ + vector sequences; + vector uniqueNames; + vector redundantNames; + vector seqFreq; + + seqNoise noise; + correctDist* correct = new correctDist(processors); + + //reads fasta and name file and loads them in order + readData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq); + if (m->control_pressed) { return 0; } + + //calc distances for cluster + string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.dist"; + correct->execute(distFileName); + delete correct; + + if (m->control_pressed) { m->mothurRemove(distFileName); return 0; } + + driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, outputFileName, nameFileName, mapFileName); + } + + if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; } + + outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName); + outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName); + outputNames.push_back(mapFileName); outputTypes["map"].push_back(mapFileName); + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + //set accnos file as new current accnosfile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int ShhhSeqsCommand::readData(correctDist* correct, seqNoise& noise, vector& seqs, vector& uNames, vector& rNames, vector& freq) { + try { + map nameMap; + map::iterator it; + m->readNames(namefile, nameMap); + bool error = false; + + ifstream in; + m->openInputFile(fastafile, in); + + while (!in.eof()) { + + if (m->control_pressed) { in.close(); return 0; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + correct->addSeq(seq.getName(), seq.getAligned()); + + it = nameMap.find(seq.getName()); + if (it != nameMap.end()) { + noise.addSeq(seq.getAligned(), seqs); + noise.addRedundantName(it->first, it->second, uNames, rNames, freq); + }else { + m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your namefile, please correct."); + error = true; + } + } + } + in.close(); + + if (error) { m->control_pressed = true; } + + return seqs.size(); + + }catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "readData"); + exit(1); + } +} +//********************************************************************************************************************** +int ShhhSeqsCommand::loadData(correctDist* correct, seqNoise& noise, vector& seqs, vector& uNames, vector& rNames, vector& freq, map& nameMap, vector& sequences) { + try { + bool error = false; + map::iterator it; + + for (int i = 0; i < sequences.size(); i++) { + + if (m->control_pressed) { return 0; } + + if (sequences[i].getName() != "") { + correct->addSeq(sequences[i].getName(), sequences[i].getAligned()); + + it = nameMap.find(sequences[i].getName()); + if (it != nameMap.end()) { + noise.addSeq(sequences[i].getAligned(), seqs); + noise.addRedundantName(it->first, it->second, uNames, rNames, freq); + }else { + m->mothurOut("[ERROR]: " + sequences[i].getName() + " is in your fasta file and not in your namefile, please correct."); + error = true; + } + } + } + + if (error) { m->control_pressed = true; } + + return seqs.size(); + + }catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "loadData"); + exit(1); + } +} +/**************************************************************************************************/ +int ShhhSeqsCommand::createProcessesGroups(SequenceParser& parser, string newFName, string newNName, string newMName, vector groups) { + try { + + vector processIDS; + int process = 1; + + //sanity check + if (groups.size() < processors) { processors = groups.size(); } + + //divide the groups between the processors + vector lines; + int numGroupsPerProcessor = groups.size() / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numGroupsPerProcessor; + int endIndex = (i+1) * numGroupsPerProcessor; + if(i == (processors - 1)){ endIndex = groups.size(); } + lines.push_back(linePair(startIndex, endIndex)); + } + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMName, lines[process].start, lines[process].end, groups); + exit(0); + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } + } + + //do my part + driverGroups(parser, newFName, newNName, newMName, lines[0].start, lines[0].end, groups); + + //force parent to wait until all the processes are done + for (int i=0;i pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; iappendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName); + m->mothurRemove((newFName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName); + m->mothurRemove((newNName + toString(processIDS[i]) + ".temp")); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "createProcessesGroups"); + exit(1); + } +} +/**************************************************************************************************/ +int ShhhSeqsCommand::driverGroups(SequenceParser& parser, string newFFile, string newNFile, string newMFile, int start, int end, vector groups){ + try { + + for (int i = start; i < end; i++) { + + start = time(NULL); + + if (m->control_pressed) { return 0; } + + m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine(); + + map thisNameMap; + thisNameMap = parser.getNameMap(groups[i]); + vector thisSeqs = parser.getSeqs(groups[i]); + + vector sequences; + vector uniqueNames; + vector redundantNames; + vector seqFreq; + + seqNoise noise; + correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load. + + //load this groups info in order + loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs); + if (m->control_pressed) { return 0; } + + //calc distances for cluster + string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + groups[i] + ".shhh.dist"; + correct->execute(distFileName); + delete correct; + + if (m->control_pressed) { m->mothurRemove(distFileName); return 0; } + + driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); + + if (m->control_pressed) { return 0; } + + m->appendFiles(newFFile+groups[i], newFFile); m->mothurRemove(newFFile+groups[i]); + m->appendFiles(newNFile+groups[i], newNFile); m->mothurRemove(newNFile+groups[i]); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + groups[i] + "."); m->mothurOutEndLine(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "driverGroups"); + exit(1); + } +} +//********************************************************************************************************************** +int ShhhSeqsCommand::driver(seqNoise& noise, + vector& sequences, + vector& uniqueNames, + vector& redundantNames, + vector& seqFreq, + string distFileName, string outputFileName, string nameFileName, string mapFileName) { + try { + double cutOff = 0.08; + int minIter = 10; + int maxIter = 1000; + double minDelta = 1e-6; + int numIters = 0; + double maxDelta = 1e6; + int numSeqs = sequences.size(); + + //run cluster command + string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08"; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: cluster(" + inputString + ")"); m->mothurOutEndLine(); + + Command* clusterCommand = new ClusterCommand(inputString); + clusterCommand->execute(); + + map > filenames = clusterCommand->getOutputFiles(); + string listFileName = filenames["list"][0]; + string rabundFileName = filenames["rabund"][0]; m->mothurRemove(rabundFileName); + string sabundFileName = filenames["sabund"][0]; m->mothurRemove(sabundFileName); + + delete clusterCommand; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + if (m->control_pressed) { m->mothurRemove(distFileName); m->mothurRemove(listFileName); return 0; } + + vector distances(numSeqs * numSeqs); + noise.getDistanceData(distFileName, distances); + m->mothurRemove(distFileName); + if (m->control_pressed) { m->mothurRemove(listFileName); return 0; } + + vector otuData(numSeqs); + vector otuFreq; + vector > otuBySeqLookUp; + noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp); + m->mothurRemove(listFileName); + if (m->control_pressed) { return 0; } + + int numOTUs = otuFreq.size(); + + vector weights(numOTUs, 0); + vector change(numOTUs, 1); + vector centroids(numOTUs, -1); + vector cumCount(numOTUs, 0); + + vector tau(numSeqs, 1); + vector anP(numSeqs, 0); + vector anI(numSeqs, 0); + vector anN(numSeqs, 0); + vector > aanI = otuBySeqLookUp; + + while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){ + + if (m->control_pressed) { return 0; } + + noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (m->control_pressed) { return 0; } + maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); if (m->control_pressed) { return 0; } + + noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; } + noise.checkCentroids(weights, centroids); if (m->control_pressed) { return 0; } + + otuFreq.assign(numOTUs, 0); + + int total = 0; + + for(int i=0;icontrol_pressed) { return 0; } + + double offset = 1e6; + double norm = 0.0000; + double minWeight = 0.1; + vector currentTau(numOTUs); + + for(int j=0;jcontrol_pressed) { return 0; } + if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){ + offset = distances[i * numSeqs+centroids[j]]; + } + } + + for(int j=0;jcontrol_pressed) { return 0; } + if(weights[j] > minWeight){ + currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j]; + norm += currentTau[j]; + } + else{ + currentTau[j] = 0.0000; + } + } + + for(int j=0;jcontrol_pressed) { return 0; } + currentTau[j] /= norm; + } + + for(int j=0;jcontrol_pressed) { return 0; } + + if(currentTau[j] > 1.0e-4){ + int oldTotal = total; + total++; + + tau.resize(oldTotal+1); + tau[oldTotal] = currentTau[j]; + otuBySeqLookUp[j][otuFreq[j]] = oldTotal; + aanI[j][otuFreq[j]] = i; + otuFreq[j]++; + + } + } + + anP.resize(total); + anI.resize(total); + } + + numIters++; + } + + noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (m->control_pressed) { return 0; } + + vector percentage(numSeqs); + noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); if (m->control_pressed) { return 0; } + noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); if (m->control_pressed) { return 0; } + + change.assign(numOTUs, 1); + noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; } + + + vector finalTau(numOTUs, 0); + for(int i=0;icontrol_pressed) { return 0; } + finalTau[otuData[i]] += int(seqFreq[i]); + } + + noise.writeOutput(outputFileName, nameFileName, mapFileName, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances); + + return 0; + + }catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** +int ShhhSeqsCommand::deconvoluteResults(string fastaFile, string nameFile){ + try { + m->mothurOutEndLine(); m->mothurOut("Deconvoluting results:"); m->mothurOutEndLine(); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + fastaFile + ", name=" + nameFile; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + string newnameFile = filenames["name"][0]; + string newfastaFile = filenames["fasta"][0]; + + m->mothurRemove(fastaFile); rename(newfastaFile.c_str(), fastaFile.c_str()); + m->mothurRemove(nameFile); rename(newnameFile.c_str(), nameFile.c_str()); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "ShhhSeqsCommand", "deconvoluteResults"); + exit(1); + } +} +//********************************************************************************************************************** + + + diff --git a/shhhseqscommand.h b/shhhseqscommand.h new file mode 100644 index 0000000..5aceca8 --- /dev/null +++ b/shhhseqscommand.h @@ -0,0 +1,332 @@ +#ifndef SHHHSEQSCOMMAND_H +#define SHHHSEQSCOMMAND_H + +/* + * shhhseqscommand.h + * Mothur + * + * Created by westcott on 11/8/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" +#include "myseqdist.h" +#include "seqnoise.h" +#include "sequenceparser.h" +#include "deconvolutecommand.h" +#include "clustercommand.h" + +//********************************************************************************************************************** + +class ShhhSeqsCommand : public Command { + +public: + ShhhSeqsCommand(string); + ShhhSeqsCommand(); + ~ShhhSeqsCommand() {} + + vector setParameters(); + string getCommandName() { return "shhh.seqs"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/Shhh.seqs"; } + string getDescription() { return "shhh.seqs"; } + + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + bool abort; + string outputDir, fastafile, namefile, groupfile; + int processors; + double sigma; + vector outputNames; + + int readData(correctDist*, seqNoise&, vector&, vector&, vector&, vector&); + int loadData(correctDist*, seqNoise&, vector&, vector&, vector&, vector&, map&, vector&); + + int driver(seqNoise&, vector&, vector&, vector&, vector&, string, string, string, string); + int driverGroups(SequenceParser&, string, string, string, int, int, vector); + int createProcessesGroups(SequenceParser&, string, string, string, vector); + int deconvoluteResults(string, string); + + + +}; + +/**************************************************************************************************/ +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct shhhseqsData { + string fastafile; + string namefile; + string groupfile; + string newFName, newNName, newMName; + MothurOut* m; + int start; + int end; + int sigma, threadID; + vector groups; + + shhhseqsData(){} + shhhseqsData(string f, string n, string g, string nff, string nnf, string nmf, vector gr, MothurOut* mout, int st, int en, int s, int tid) { + fastafile = f; + namefile = n; + groupfile = g; + newFName = nff; + newNName = nnf; + newNName = nmf; + m = mout; + start = st; + end = en; + sigma = s; + threadID = tid; + groups = gr; + } +}; + +/**************************************************************************************************/ +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) +#else +static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){ + shhhseqsData* pDataArray; + pDataArray = (shhhseqsData*)lpParam; + + try { + + //parse fasta and name file by group + SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); + + //precluster each group + for (int k = pDataArray->start; k < pDataArray->end; k++) { + + int start = time(NULL); + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine(); + + map thisNameMap; + thisNameMap = parser.getNameMap(pDataArray->groups[k]); + vector thisSeqs = parser.getSeqs(pDataArray->groups[k]); + + if (pDataArray->m->control_pressed) { return 0; } + + vector sequences; + vector uniqueNames; + vector redundantNames; + vector seqFreq; + + seqNoise noise; + correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load. + + //load this groups info in order + //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs); + ////////////////////////////////////////////////////////////////////////////////////////////////// + bool error = false; + map::iterator it; + + for (int i = 0; i < thisSeqs.size(); i++) { + + if (pDataArray->m->control_pressed) { return 0; } + + if (thisSeqs[i].getName() != "") { + correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned()); + + it = thisNameMap.find(thisSeqs[i].getName()); + if (it != thisNameMap.end()) { + noise.addSeq(thisSeqs[i].getAligned(), sequences); + noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq); + }else { + pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct."); + error = true; + } + } + } + + if (error) { return 0; } + ////////////////////////////////////////////////////////////////////////////////////////////////// + + if (pDataArray->m->control_pressed) { return 0; } + + //calc distances for cluster + string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist"; + correct->execute(distFileName); + delete correct; + + if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; } + + //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); + /////////////////////////////////////////////////////////////////////////////////////////////////// + double cutOff = 0.08; + int minIter = 10; + int maxIter = 1000; + double minDelta = 1e-6; + int numIters = 0; + double maxDelta = 1e6; + int numSeqs = sequences.size(); + + //run cluster command + string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08"; + pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine(); + + Command* clusterCommand = new ClusterCommand(inputString); + clusterCommand->execute(); + + map > filenames = clusterCommand->getOutputFiles(); + string listFileName = filenames["list"][0]; + string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName); + string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName); + + delete clusterCommand; + pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); + + if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; } + + vector distances(numSeqs * numSeqs); + noise.getDistanceData(distFileName, distances); + pDataArray->m->mothurRemove(distFileName); + if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; } + + vector otuData(numSeqs); + vector otuFreq; + vector > otuBySeqLookUp; + noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp); + pDataArray->m->mothurRemove(listFileName); + if (pDataArray->m->control_pressed) { return 0; } + + int numOTUs = otuFreq.size(); + + vector weights(numOTUs, 0); + vector change(numOTUs, 1); + vector centroids(numOTUs, -1); + vector cumCount(numOTUs, 0); + + vector tau(numSeqs, 1); + vector anP(numSeqs, 0); + vector anI(numSeqs, 0); + vector anN(numSeqs, 0); + vector > aanI = otuBySeqLookUp; + + while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){ + + if (pDataArray->m->control_pressed) { return 0; } + + noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; } + maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; } + + noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; } + noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; } + + otuFreq.assign(numOTUs, 0); + + int total = 0; + + for(int i=0;im->control_pressed) { return 0; } + + double offset = 1e6; + double norm = 0.0000; + double minWeight = 0.1; + vector currentTau(numOTUs); + + for(int j=0;jm->control_pressed) { return 0; } + if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){ + offset = distances[i * numSeqs+centroids[j]]; + } + } + + for(int j=0;jm->control_pressed) { return 0; } + if(weights[j] > minWeight){ + currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j]; + norm += currentTau[j]; + } + else{ + currentTau[j] = 0.0000; + } + } + + for(int j=0;jm->control_pressed) { return 0; } + currentTau[j] /= norm; + } + + for(int j=0;jm->control_pressed) { return 0; } + + if(currentTau[j] > 1.0e-4){ + int oldTotal = total; + total++; + + tau.resize(oldTotal+1); + tau[oldTotal] = currentTau[j]; + otuBySeqLookUp[j][otuFreq[j]] = oldTotal; + aanI[j][otuFreq[j]] = i; + otuFreq[j]++; + + } + } + + anP.resize(total); + anI.resize(total); + } + + numIters++; + } + + noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; } + + vector percentage(numSeqs); + noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); if (pDataArray->m->control_pressed) { return 0; } + noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); if (pDataArray->m->control_pressed) { return 0; } + + change.assign(numOTUs, 1); + noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; } + + + vector finalTau(numOTUs, 0); + for(int i=0;im->control_pressed) { return 0; } + finalTau[otuData[i]] += int(seqFreq[i]); + } + + noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances); + + /////////////////////////////////////////////////////////////////////////////////////////////////// + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]); + pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]); + + pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine(); + + + } + + return 0; + + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction"); + exit(1); + } +} +#endif +/**************************************************************************************************/ + +#endif diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index 6d69735..7a1af62 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -272,32 +272,34 @@ int TrimFlowsCommand::execute(){ ofstream output; if(allFiles){ - + set namesAlreadyProcessed; flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files"; m->openOutputFile(flowFilesFileName, output); for(int i=0;imothurRemove(barcodePrimerComboFileNames[i][j]); - } - else{ - output << barcodePrimerComboFileNames[i][j] << endl; - outputNames.push_back(barcodePrimerComboFileNames[i][j]); - outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]); + if (namesAlreadyProcessed.count(barcodePrimerComboFileNames[i][j]) == 0) { + FILE * pFile; + unsigned long long size; + + //get num bytes in file + pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell(pFile); + fclose (pFile); + } + + if(size < 10){ + m->mothurRemove(barcodePrimerComboFileNames[i][j]); + } + else{ + output << barcodePrimerComboFileNames[i][j] << endl; + outputNames.push_back(barcodePrimerComboFileNames[i][j]); + outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]); + } + namesAlreadyProcessed.insert(barcodePrimerComboFileNames[i][j]); } } } -- 2.39.2