A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; };
A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7730EFE13967241007433A3 /* countseqscommand.cpp */; };
+ A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774101314695AF60098E6AC /* shhhseqscommand.cpp */; };
+ A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; };
+ A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; };
A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; };
A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = "<group>"; };
A7730EFD13967241007433A3 /* countseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countseqscommand.h; sourceTree = "<group>"; };
A7730EFE13967241007433A3 /* countseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countseqscommand.cpp; sourceTree = "<group>"; };
+ A774101214695AF60098E6AC /* shhhseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shhhseqscommand.h; sourceTree = "<group>"; };
+ A774101314695AF60098E6AC /* shhhseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = shhhseqscommand.cpp; sourceTree = "<group>"; };
+ A774104614696F320098E6AC /* myseqdist.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myseqdist.cpp; sourceTree = "<group>"; };
+ A774104714696F320098E6AC /* myseqdist.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myseqdist.h; sourceTree = "<group>"; };
+ A77410F414697C300098E6AC /* seqnoise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqnoise.cpp; sourceTree = "<group>"; };
+ A77410F514697C300098E6AC /* seqnoise.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqnoise.h; sourceTree = "<group>"; };
A778FE69134CA6CA00C0BA33 /* getcommandinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcommandinfocommand.h; sourceTree = "<group>"; };
A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = "<group>"; };
A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = "<group>"; };
A7E9B75C12D37EC400DA6239 /* mothur.h */,
A7E9B75D12D37EC400DA6239 /* mothurout.cpp */,
A7E9B75E12D37EC400DA6239 /* mothurout.h */,
+ A774104714696F320098E6AC /* myseqdist.h */,
+ A774104614696F320098E6AC /* myseqdist.cpp */,
A7E9B76112D37EC400DA6239 /* nast.cpp */,
A7E9B76212D37EC400DA6239 /* nast.hpp */,
A7E9B76312D37EC400DA6239 /* nastreport.cpp */,
A7E9B7AD12D37EC400DA6239 /* rarefactioncurvedata.h */,
7E6BE10812F710D8007ADDBE /* refchimeratest.h */,
7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */,
+ A77410F514697C300098E6AC /* seqnoise.h */,
+ A77410F414697C300098E6AC /* seqnoise.cpp */,
A7E9BA5312D39A5E00DA6239 /* read */,
A7E9B82312D37EC400DA6239 /* sharedutilities.cpp */,
A7E9B82412D37EC400DA6239 /* sharedutilities.h */,
A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
A7E9B82812D37EC400DA6239 /* shhhercommand.h */,
A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */,
+ A774101214695AF60098E6AC /* shhhseqscommand.h */,
+ A774101314695AF60098E6AC /* shhhseqscommand.cpp */,
A7E9B84012D37EC400DA6239 /* splitabundcommand.h */,
A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */,
A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */,
A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */,
A7BF221414587886000AD524 /* myPerseus.cpp in Sources */,
A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */,
+ A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */,
+ A774104814696F320098E6AC /* myseqdist.cpp in Sources */,
+ A77410F614697C300098E6AC /* seqnoise.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
- helpString += "The name parameter allows you to provide a name file associated with your fasta file. If none is given unique.seqs will be run to generate one. \n";
+ helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
vector<bool> chimeras(numSeqs, 0);
for(int i=0;i<numSeqs;i++){
- cout << sequences[i].seqName << endl << sequences[i].sequence << endl << sequences[i].frequency << endl;
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
vector<bool> restricted = chimeras;
vector<pwAlign> alignments(numSeqs);
int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
- cout << "comparisons = " << comparisons << endl;
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
if(comparisons >= 2){
minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
- cout << "minMismatchToChimera = " << minMismatchToChimera << endl;
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
int minMismatchToTrimera = numeric_limits<int>::max();
if(minMismatchToChimera >= 3 && comparisons >= 3){
minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
- cout << "minMismatchToTrimera = " << minMismatchToTrimera << endl;
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
}
double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
- cout << "singleDist = " << singleDist << endl;
+
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
string type;
type = "chimera";
chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
}
- cout << "chimeraRefSeq = " << chimeraRefSeq << endl;
+ ;
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
- cout << "chimeraDist = " << chimeraDist << endl;
+
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);
- cout << "loonIndex = " << loonIndex << endl;
+
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
//Wait until all threads have terminated.
WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
- cout << "here" << endl;
+
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
int numSeqs = 0;
int numChimeras = 0;
- //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
if(processors == 1){ numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); }
else{ numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); }
- //#else
- // numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras);
- //#endif
+
+ //add headings
+ ofstream out;
+ m->openOutputFile(outputFileName+".temp", out);
+ out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n";
+ out.close();
+
+ m->appendFiles(outputFileName, outputFileName+".temp");
+ m->mothurRemove(outputFileName); rename((outputFileName+".temp").c_str(), outputFileName.c_str());
+
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
//remove file made for uchime
ofstream out;
m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+ out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n";
float temp1;
string parent1, parent2, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, flag;
alns = "\"" + alns + "\"";
vector<char*> cPara;
-
- char* tempUchime;
+
+ string path = m->argv;
+ string tempPath = path;
+ for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
+ path = path.substr(0, (tempPath.find_last_of('m')));
+
+ string uchimeCommand = path;
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- tempUchime= new char[10];
- *tempUchime = '\0';
- strncat(tempUchime, "./uchime ", 9);
+ uchimeCommand += "uchime ";
#else
- tempUchime= new char[8];
- *tempUchime = '\0';
- strncat(tempUchime, "uchime ", 7);
+ uchimeCommand += "uchime";
+ uchimeCommand = "\"" + uchimeCommand + "\"";
#endif
+
+ char* tempUchime;
+ tempUchime= new char[uchimeCommand.length()+1];
+ *tempUchime = '\0';
+ strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
cPara.push_back(tempUchime);
char* tempIn = new char[8];
//uchime_main(numArgs, uchimeParameters);
//cout << "commandString = " << commandString << endl;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+ commandString = "\"" + commandString + "\"";
+#endif
system(commandString.c_str());
//free memory
string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
int processors;
+
vector<string> outputNames;
vector<string> fastaFileNames;
vector<string> nameFileNames;
vector<char*> cPara;
+ string path = pDataArray->m->argv;
+ string tempPath = path;
+ for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
+ path = path.substr(0, (tempPath.find_last_of('m')));
+
+ string uchimeCommand = path;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ uchimeCommand += "uchime ";
+#else
+ uchimeCommand += "uchime";
+ uchimeCommand = "\"" + uchimeCommand + "\"";
+#endif
+
char* tempUchime;
- tempUchime= new char[8];
+ tempUchime= new char[uchimeCommand.length()+1];
*tempUchime = '\0';
- strncat(tempUchime, "uchime ", 7);
+ strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
cPara.push_back(tempUchime);
char* tempIn = new char[8];
//uchime_main(numArgs, uchimeParameters);
//cout << "commandString = " << commandString << endl;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+ commandString = "\"" + commandString + "\"";
+#endif
system(commandString.c_str());
//free memory
#include "clearmemorycommand.h"
#include "summarytaxcommand.h"
#include "chimeraperseuscommand.h"
+#include "shhhseqscommand.h"
/*******************************************************/
commands["shhh.flows"] = "MPIEnabled";
commands["sens.spec"] = "sens.spec";
commands["seq.error"] = "seq.error";
- commands["seq.error"] = "summary.tax";
+ commands["summary.tax"] = "summary.tax";
+ commands["shhh.seqs"] = "shhh.seqs";
commands["quit"] = "MPIEnabled";
else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); }
else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); }
else if(commandName == "chimera.perseus") { command = new ChimeraPerseusCommand(optionString); }
+ else if(commandName == "shhh.seqs") { command = new ShhhSeqsCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); }
else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); }
else if(commandName == "chimera.perseus") { pipecommand = new ChimeraPerseusCommand(optionString); }
+ else if(commandName == "shhh.seqs") { pipecommand = new ShhhSeqsCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); }
else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); }
else if(commandName == "chimera.perseus") { shellcommand = new ChimeraPerseusCommand(); }
+ else if(commandName == "shhh.seqs") { shellcommand = new ShhhSeqsCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
//custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
-typedef struct distanceData {
+struct distanceData {
int startLine;
int endLine;
string dFileName;
//get set names
string setA = namesOfGroupCombos[c][0];
string setB = namesOfGroupCombos[c][1];
- cout << setA << '\t' << setB << endl;
+ //cout << setA << '\t' << setB << endl;
//get filename
string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName);
}
}
- for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
- cout << setACount << endl;
+ //for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
+ //cout << setACount << endl;
if ((setACount == 0) || (setBCount == 0)) {
m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine();
#define isnan(x) ((x) != (x))
#define isinf(x) (fabs(x) == std::numeric_limits<double>::infinity())
+
typedef unsigned long ull;
struct IntNode {
rightParent = -1;
breakPoint = -1;
- for(int l=0;l<seqLength;l++){
+ for(int l=0;l<seqLength-1;l++){
if (m->control_pressed) { return 0; }
--- /dev/null
+/*
+ * pds.seqdist.cpp
+ *
+ *
+ * Created by Pat Schloss on 8/12/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "myseqdist.h"
+#include "sequence.hpp"
+
+/**************************************************************************************************/
+correctDist::correctDist(int p) : processors(p) {
+ try {
+ m = MothurOut::getInstance();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "correctDist");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+correctDist::correctDist(string sequenceFileName, int p) : processors(p) {
+ try {
+ m = MothurOut::getInstance();
+ getSequences(sequenceFileName);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "correctDist");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int correctDist::addSeq(string seqName, string seqSeq){
+ try {
+ names.push_back(seqName);
+ sequences.push_back(fixSequence(seqSeq));
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "addSeq");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int correctDist::execute(string distanceFileName){
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+ processors = 1;
+#endif
+ correctMatrix.resize(4);
+ for(int i=0;i<4;i++){ correctMatrix[i].resize(4); }
+
+ correctMatrix[0][0] = 0.000000; //AA
+ correctMatrix[1][0] = 11.619259; //CA
+ correctMatrix[2][0] = 11.694004; //TA
+ correctMatrix[3][0] = 7.748623; //GA
+
+ correctMatrix[1][1] = 0.000000; //CC
+ correctMatrix[2][1] = 7.619657; //TC
+ correctMatrix[3][1] = 12.852562; //GC
+
+ correctMatrix[2][2] = 0.000000; //TT
+ correctMatrix[3][2] = 10.964048; //TG
+
+ correctMatrix[3][3] = 0.000000; //GG
+
+ for(int i=0;i<4;i++){
+ for(int j=0;j<i;j++){
+ correctMatrix[j][i] = correctMatrix[i][j];
+ }
+ }
+
+ numSeqs = names.size();
+
+ if(processors == 1){ driver(0, numSeqs, distanceFileName); }
+ else{
+
+ for(int i=0;i<processors;i++){
+ start.push_back(int (sqrt(float(i)/float(processors)) * numSeqs));
+ end.push_back(int (sqrt(float(i+1)/float(processors)) * numSeqs));
+ }
+
+ createProcess(distanceFileName);
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "execute");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int correctDist::getSequences(string sequenceFileName){
+ try {
+ ifstream sequenceFile;
+ m->openInputFile(sequenceFileName, sequenceFile);
+ string seqName, seqSeq;
+
+ while(!sequenceFile.eof()){
+ if (m->control_pressed) { break; }
+
+ Sequence temp(sequenceFile); m->gobble(sequenceFile);
+
+ if (temp.getName() != "") {
+ names.push_back(temp.getName());
+ sequences.push_back(fixSequence(temp.getAligned()));
+ }
+ }
+ sequenceFile.close();
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "getSequences");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+vector<int> correctDist::fixSequence(string sequence){
+ try {
+ int alignLength = sequence.length();
+
+ vector<int> seqVector;
+
+ for(int i=0;i<alignLength;i++){
+ if(sequence[i] == 'A') { seqVector.push_back(0); }
+ else if(sequence[i] == 'C') { seqVector.push_back(1); }
+ else if(sequence[i] == 'T') { seqVector.push_back(2); }
+ else if(sequence[i] == 'G') { seqVector.push_back(3); }
+ else if(sequence[i] == 'N') { seqVector.push_back(0); }//hmmmm....
+ }
+
+ return seqVector;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "fixSequence");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int correctDist::createProcess(string distanceFileName){
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 1;
+ vector<int> processIDs;
+
+ while(process != processors){
+
+ int pid = fork();
+
+ if(pid > 0){
+ processIDs.push_back(pid);
+ process++;
+ }
+ else if(pid == 0){
+ driver(start[process], end[process], distanceFileName + toString(getpid()) + ".temp");
+ exit(0);
+ }
+ else{
+ cout << "Doh!" << endl;
+ for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill(temp, SIGINT); }
+ exit(0);
+ }
+ }
+
+ driver(start[0], end[0], distanceFileName);
+
+ for (int i=0;i<processIDs.size();i++) {
+ int temp = processIDs[i];
+ wait(&temp);
+ }
+
+ for(int i=0;i<processIDs.size();i++){
+ m->appendFiles((distanceFileName + toString(processIDs[i]) + ".temp"), distanceFileName);
+ remove((distanceFileName + toString(processIDs[i]) + ".temp").c_str());
+ }
+#endif
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "createProcess");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int correctDist::driver(int start, int end, string distFileName){
+ try {
+ ofstream distFile;
+ m->openOutputFile(distFileName, distFile);
+ distFile << setprecision(9);
+
+ if(start == 0){
+ distFile << numSeqs << endl;
+ }
+
+ int startTime = time(NULL);
+
+ m->mothurOut("\nCalculating distances...\n");
+
+ for(int i=start;i<end;i++){
+
+ distFile << i;
+
+ for(int j=0;j<i;j++){
+
+ if (m->control_pressed) { distFile.close(); return 0; }
+
+ double dist = getDist(sequences[i], sequences[j]);
+
+ distFile << ' ' << dist;
+ }
+ distFile << endl;
+
+ if(i % 100 == 0){ m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); }
+ }
+ distFile.close();
+
+ if((end-1) % 100 != 0){ m->mothurOut(toString(end-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); }
+ m->mothurOut("Done.\n");
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "driver");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+double correctDist::getDist(vector<int>& seqA, vector<int>& seqB){
+ try {
+
+ int lengthA = seqA.size();
+ int lengthB = seqB.size();
+
+ vector<vector<double> > alignMatrix(lengthA+1);
+ vector<vector<char> > alignMoves(lengthA+1);
+
+ for(int i=0;i<=lengthA;i++){
+ alignMatrix[i].resize(lengthB+1, 0);
+ alignMoves[i].resize(lengthB+1, 'x');
+ }
+
+ for(int i=0;i<=lengthA;i++){
+ alignMatrix[i][0] = 15.0 * i;
+ alignMoves[i][0] = 'u';
+ }
+ for(int i=0;i<=lengthB;i++){
+ alignMatrix[0][i] = 15.0 * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=lengthA;i++){
+ for(int j=1;j<=lengthB;j++){
+
+ if (m->control_pressed) { return 0; }
+
+ double nogap;
+ nogap = alignMatrix[i-1][j-1] + correctMatrix[seqA[i-1]][seqB[j-1]];
+
+
+ double gap;
+ double left;
+ if(i == lengthA){ //terminal gap
+ left = alignMatrix[i][j-1];
+ }
+ else{
+ if(seqB[j-1] == getLastMatch('l', alignMoves, i, j, seqA, seqB)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ left = alignMatrix[i][j-1] + gap;
+ }
+
+
+ double up;
+ if(j == lengthB){ //terminal gap
+ up = alignMatrix[i-1][j];
+ }
+ else{
+
+ if(seqA[i-1] == getLastMatch('u', alignMoves, i, j, seqA, seqB)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ up = alignMatrix[i-1][j] + gap;
+ }
+
+
+
+ if(nogap < left){
+ if(nogap < up){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogap;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ else{
+ if(left < up){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = left;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ }
+ }
+
+ int i = lengthA;
+ int j = lengthB;
+ int count = 0;
+
+
+ // string alignA = "";
+ // string alignB = "";
+ // string bases = "ACTG";
+ //
+ // for(int i=0;i<lengthA;i++){
+ // cout << bases[seqA[i]];
+ // }cout << endl;
+ //
+ // for(int i=0;i<lengthB;i++){
+ // cout << bases[seqB[i]];
+ // }cout << endl;
+
+ while(i > 0 && j > 0){
+
+ if (m->control_pressed) { return 0; }
+
+ if(alignMoves[i][j] == 'd'){
+ // alignA = bases[seqA[i-1]] + alignA;
+ // alignB = bases[seqB[j-1]] + alignB;
+
+ count++;
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ if(j != lengthB){
+ // alignA = bases[seqA[i-1]] + alignA;
+ // alignB = '-' + alignB;
+ count++;
+ }
+
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ if(i != lengthA){
+ // alignA = '-' + alignA;
+ // alignB = bases[seqB[j-1]] + alignB;
+ count++;
+ }
+
+ j--;
+ }
+ }
+
+ // cout << alignA << endl << alignB << endl;
+
+ return alignMatrix[lengthA][lengthB] / (double)count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "getDist");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int correctDist::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
+ try {
+
+ char nullReturn = -1;
+
+ while(i>=1 && j>=1){
+
+ if (m->control_pressed) { return nullReturn; }
+
+ if(direction == 'd'){
+ if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
+ else { return nullReturn; }
+ }
+
+ else if(direction == 'l') { j--; }
+ else { i--; }
+
+ direction = alignMoves[i][j];
+ }
+
+ return nullReturn;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "correctDist", "getLastMatch");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+
+
--- /dev/null
+#ifndef CORRECTDIST_H
+#define CORRECTDIST_H
+
+
+/*
+ * pds.seqdist.h
+ *
+ *
+ * Created by Pat Schloss on 8/12/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothurout.h"
+
+/**************************************************************************************************/
+
+class correctDist {
+public:
+ correctDist(string, int);
+ correctDist(int);
+ ~correctDist(){}
+
+ int addSeq(string, string);
+ int execute(string);
+
+private:
+ MothurOut* m;
+ int getSequences(string);
+ vector<int> fixSequence(string);
+
+ int driver(int, int, string);
+ int createProcess(string);
+
+ double getDist(vector<int>&, vector<int>&);
+ int getLastMatch(char, vector<vector<char> >&, int, int, vector<int>&, vector<int>&);
+
+ vector<vector<double> > correctMatrix;
+
+ vector<vector<int> > sequences;
+
+ vector<string> names;
+ int numSeqs;
+ int processors;
+ vector<int> start;
+ vector<int> end;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
--- /dev/null
+/*
+ * mySeqNoise.cpp
+ *
+ *
+ * Created by Pat Schloss on 8/31/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "seqnoise.h"
+#include "sequence.hpp"
+
+#define MIN_DELTA 1.0e-6
+#define MIN_ITER 20
+#define MAX_ITER 1000
+#define MIN_COUNT 0.1
+#define MIN_TAU 1.0e-4
+#define MIN_WEIGHT 0.1
+
+
+/**************************************************************************************************/
+int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
+ try {
+
+ ifstream sequenceFile;
+ m->openInputFile(sequenceFileName, sequenceFile);
+
+ while(!sequenceFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ Sequence temp(sequenceFile); m->gobble(sequenceFile);
+
+ if (temp.getName() != "") {
+ sequences.push_back(temp.getAligned());
+ }
+ }
+ sequenceFile.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "getSequenceData");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int seqNoise::addSeq(string seq, vector<string>& sequences){
+ try {
+ sequences.push_back(seq);
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "addSeq");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+//no checks for file mismatches
+int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
+ try {
+ string unique, redundant;
+ ifstream namesFile;
+ m->openInputFile(namesFileName, namesFile);
+
+ for(int i=0;i<redundantNames.size();i++){
+
+ if (m->control_pressed) { break; }
+
+ namesFile >> uniqueNames[i]; m->gobble(namesFile);
+ namesFile >> redundantNames[i]; m->gobble(namesFile);
+
+ seqFreq[i] = m->getNumNames(redundantNames[i]);
+ }
+ namesFile.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "getRedundantNames");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
+ try {
+
+ uniqueNames.push_back(uniqueName);
+ redundantNames.push_back(redundantName);
+ seqFreq.push_back(m->getNumNames(redundantName));
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "addRedundantName");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
+ try {
+
+ ifstream distFile;
+ m->openInputFile(distFileName, distFile);
+
+ int numSeqs = 0;
+ string name = "";
+
+ distFile >> numSeqs;
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
+ distances[i * numSeqs + i] = 0.0000;
+
+ distFile >> name;
+
+ for(int j=0;j<i;j++){
+ distFile >> distances[i * numSeqs + j];
+ distances[j * numSeqs + i] = distances[i * numSeqs + j];
+ }
+ }
+
+ distFile.close();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "getDistanceData");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
+ try {
+
+ ifstream listFile;
+ m->openInputFile(listFileName, listFile);
+ double threshold;
+ int numOTUs;
+
+ if(listFile.peek() == 'u'){ m->getline(listFile); }
+
+ while(listFile){
+ listFile >> threshold;
+
+ if(threshold < cutOff){
+ m->getline(listFile);
+ }
+ else{
+ listFile >> numOTUs;
+ otuFreq.resize(numOTUs, 0);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ string otu;
+ listFile >> otu;
+
+ int count = 0;
+
+ string number = "";
+
+ for(int j=0;j<otu.size();j++){
+ if(otu[j] != ','){
+ number += otu[j];
+ }
+ else{
+ int index = atoi(number.c_str());
+ otuData[index] = i;
+ count++;
+ number = "";
+ }
+ }
+
+ int index = atoi(number.c_str());
+ otuData[index] = i;
+ count++;
+
+ otuFreq[i] = count;
+ }
+
+ otuBySeqLookUp.resize(numOTUs);
+
+ int numSeqs = otuData.size();
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+ otuBySeqLookUp[otuData[i]].push_back(i);
+ }
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { return 0; }
+ for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
+ otuBySeqLookUp[i].push_back(0);
+ }
+ }
+
+ break;
+ }
+ }
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "getListData");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int seqNoise::updateOTUCountData(vector<int> otuFreq,
+ vector<vector<int> > otuBySeqLookUp,
+ vector<vector<int> > aanI,
+ vector<int>& anP,
+ vector<int>& anI,
+ vector<int>& cumCount
+ ){
+ try {
+ int numOTUs = otuFreq.size();
+
+ int count = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ cumCount[i] = count;
+
+ if (m->control_pressed) { return 0; }
+
+ for(int j=0;j<otuFreq[i];j++){
+ anP[count] = otuBySeqLookUp[i][j];
+ anI[count] = aanI[i][j];
+
+ count++;
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "updateOTUCountData");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+double seqNoise::calcNewWeights(
+ vector<double>& weights, //
+ vector<int> seqFreq, //
+ vector<int> anI, //
+ vector<int> cumCount, //
+ vector<int> anP, //
+ vector<int> otuFreq, //
+ vector<double> tau //
+ ){
+ try {
+
+ int numOTUs = weights.size();
+ double maxChange = -1;
+
+ cout.flush();
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ double change = weights[i];
+
+ weights[i] = 0.0000;
+
+ for(int j=0;j<otuFreq[i];j++){
+
+ int index1 = cumCount[i] + j;
+ int index2 = anI[index1];
+
+ double currentTau = tau[anP[index1]];
+ double freq = double(seqFreq[index2]);
+
+ weights[i] += currentTau * freq;
+ }
+ change = fabs(weights[i] - change);
+
+ if(change > maxChange){ maxChange = change; }
+ cout.flush();
+ }
+ return maxChange;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "calcNewWeights");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::calcCentroids(
+ vector<int> anI,
+ vector<int> anP,
+ vector<int>& change,
+ vector<int>& centroids,
+ vector<int> cumCount,
+ vector<double> distances,///
+ vector<int> seqFreq,
+ vector<int> otuFreq,
+ vector<double> tau
+ ){
+ try {
+ int numOTUs = change.size();
+ int numSeqs = seqFreq.size();
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { return 0; }
+
+ int minFIndex = -1;
+ double minFValue = 1e10;
+
+ change[i] = 0;
+ double count = 0.00000;
+
+ int freqOfOTU = otuFreq[i];
+
+ for(int j=0;j<freqOfOTU;j++){
+ int index = cumCount[i] + j;
+ count += seqFreq[anI[index]]*tau[anP[index]];
+ }
+
+ if(freqOfOTU > 0 && count > MIN_COUNT){
+
+ vector<double> adF(freqOfOTU);
+ vector<int> anL(freqOfOTU);
+
+ for(int j=0;j<freqOfOTU;j++){
+ anL[j] = anI[cumCount[i] + j];
+ adF[j] = 0.0000;
+ }
+
+ for(int j=0;j<freqOfOTU;j++){
+ int index = cumCount[i] + j;
+ double curTau = tau[anP[index]];
+
+ for(int k=0;k<freqOfOTU;k++){
+ double dist = distances[anL[j]*numSeqs + anL[k]];
+
+ adF[k] += dist * curTau * seqFreq[anL[j]];
+ }
+ }
+
+ for(int j=0;j<freqOfOTU;j++){
+ if(adF[j] < minFValue){
+ minFIndex = j;
+ minFValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFIndex]){
+ change[i] = 1;
+ centroids[i] = anL[minFIndex];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "calcCentroids");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
+ try {
+ int numOTUs = centroids.size();
+ vector<int> unique(numOTUs, 1);
+
+ double minWeight = MIN_WEIGHT;
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { return 0; }
+ if(weights[i] < minWeight){ unique[i] = -1; }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { return 0; }
+ if(unique[i] == 1){
+ for(int j=i+1; j<numOTUs;j++){
+ if(unique[j] == 1){
+ if(centroids[i] == centroids[j]){
+ unique[j] = 0;
+ weights[i] += weights[j];
+ weights[j] = 0;
+ }
+ }
+ }
+ }
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "checkCentroids");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
+ try {
+
+ int numOTUs = cumCount.size();
+ int numSeqs = otuData.size();
+
+ vector<double> bestTau(numSeqs, 0);
+ vector<double> bestIndex(numSeqs, -1);
+
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { return 0; }
+ for(int j=0;j<otuFreq[i];j++){
+
+ int index1 = cumCount[i] + j;
+ double thisTau = tau[anP[index1]];
+ int index2 = anI[index1];
+
+ if(thisTau > bestTau[index2]){
+ bestTau[index2] = thisTau;
+ bestIndex[index2] = i;
+ }
+ }
+ }
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+ otuData[i] = bestIndex[i];
+ percentage[i] = 1 - bestTau[i];
+ }
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "setUpOTUData");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
+ try {
+ int numSeqs = otuData.size();
+ int numOTUs = otuFreq.size();
+ int total = numSeqs;
+
+ otuFreq.assign(numOTUs, 0);
+ tau.assign(numSeqs, 1);
+ anP.assign(numSeqs, 0);
+ anI.assign(numSeqs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+ int otu = otuData[i];
+ total++;
+
+ otuBySeqLookUp[otu][otuFreq[otu]] = i;
+ aanI[otu][otuFreq[otu]] = i;
+ otuFreq[otu]++;
+ }
+ updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "finishOTUData");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
+ try{
+ char nullReturn = -1;
+
+ while(i>=1 && j>=1){
+ if (m->control_pressed) { return nullReturn; }
+ if(direction == 'd'){
+ if(seqA[i-1] == seqB[j-1]) { return seqA[i-1]; }
+ else { return nullReturn; }
+ }
+
+ else if(direction == 'l') { j--; }
+ else { i--; }
+
+ direction = alignMoves[i][j];
+ }
+
+ return nullReturn;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "getLastMatch");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::countDiffs(vector<int> query, vector<int> ref){
+ try {
+ //double MATCH = 5.0;
+ //double MISMATCH = -2.0;
+ //double GAP = -2.0;
+
+ vector<vector<double> > correctMatrix(4);
+ for(int i=0;i<4;i++){ correctMatrix[i].resize(4); }
+
+ correctMatrix[0][0] = 0.000000; //AA
+ correctMatrix[1][0] = 11.619259; //CA
+ correctMatrix[2][0] = 11.694004; //TA
+ correctMatrix[3][0] = 7.748623; //GA
+
+ correctMatrix[1][1] = 0.000000; //CC
+ correctMatrix[2][1] = 7.619657; //TC
+ correctMatrix[3][1] = 12.852562; //GC
+
+ correctMatrix[2][2] = 0.000000; //TT
+ correctMatrix[3][2] = 10.964048; //TG
+
+ correctMatrix[3][3] = 0.000000; //GG
+
+ for(int i=0;i<4;i++){
+ for(int j=0;j<i;j++){
+ correctMatrix[j][i] = correctMatrix[i][j];
+ }
+ }
+
+ int queryLength = query.size();
+ int refLength = ref.size();
+
+ vector<vector<double> > alignMatrix(queryLength + 1);
+ vector<vector<char> > alignMoves(queryLength + 1);
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i].resize(refLength + 1, 0);
+ alignMoves[i].resize(refLength + 1, 'x');
+ }
+
+ for(int i=0;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[i][0] = 15.0 * i;
+ alignMoves[i][0] = 'u';
+ }
+
+ for(int i=0;i<=refLength;i++){
+ if (m->control_pressed) { return 0; }
+ alignMatrix[0][i] = 15.0 * i;
+ alignMoves[0][i] = 'l';
+ }
+
+ for(int i=1;i<=queryLength;i++){
+ if (m->control_pressed) { return 0; }
+ for(int j=1;j<=refLength;j++){
+
+ double nogap;
+ nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
+
+
+ double gap;
+ double left;
+ if(i == queryLength){ //terminal gap
+ left = alignMatrix[i][j-1];
+ }
+ else{
+ if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ left = alignMatrix[i][j-1] + gap;
+ }
+
+
+ double up;
+ if(j == refLength){ //terminal gap
+ up = alignMatrix[i-1][j];
+ }
+ else{
+
+ if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
+ gap = 4.0;
+ }
+ else{
+ gap = 15.0;
+ }
+
+ up = alignMatrix[i-1][j] + gap;
+ }
+
+
+
+ if(nogap < left){
+ if(nogap < up){
+ alignMoves[i][j] = 'd';
+ alignMatrix[i][j] = nogap;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ else{
+ if(left < up){
+ alignMoves[i][j] = 'l';
+ alignMatrix[i][j] = left;
+ }
+ else{
+ alignMoves[i][j] = 'u';
+ alignMatrix[i][j] = up;
+ }
+ }
+ }
+ }
+
+ int i = queryLength;
+ int j = refLength;
+ int diffs = 0;
+
+ // string alignA = "";
+ // string alignB = "";
+ // string bases = "ACTG";
+
+ while(i > 0 && j > 0){
+ if (m->control_pressed) { return 0; }
+ if(alignMoves[i][j] == 'd'){
+ // alignA = bases[query[i-1]] + alignA;
+ // alignB = bases[ref[j-1]] + alignB;
+
+ if(query[i-1] != ref[j-1]) { diffs++; }
+
+ i--;
+ j--;
+ }
+ else if(alignMoves[i][j] == 'u'){
+ if(j != refLength){
+ // alignA = bases[query[i-1]] + alignA;
+ // alignB = '-' + alignB;
+
+ diffs++;
+ }
+
+ i--;
+ }
+ else if(alignMoves[i][j] == 'l'){
+ if(i != queryLength){
+ // alignA = '-' + alignA;
+ // alignB = bases[ref[j-1]] + alignB;
+
+ diffs++;
+ }
+
+ j--;
+ }
+ }
+
+ // cout << diffs << endl;
+ // cout << alignA << endl;
+ // cout << alignB << endl;
+ // cout << endl;
+
+ return diffs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "countDiffs");
+ exit(1);
+ }
+
+}
+
+/**************************************************************************************************/
+
+vector<int> seqNoise::convertSeq(string bases){
+ try {
+ vector<int> numbers(bases.length(), -1);
+
+ for(int i=0;i<bases.length();i++){
+ if (m->control_pressed) { return numbers; }
+
+ char b = bases[i];
+
+ if(b == 'A') { numbers[i] = 0; }
+ else if(b=='C') { numbers[i] = 1; }
+ else if(b=='T') { numbers[i] = 2; }
+ else if(b=='G') { numbers[i] = 3; }
+ else { numbers[i] = 0; }
+ }
+
+ return numbers;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "convertSeq");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string seqNoise::degapSeq(string aligned){
+ try {
+ string unaligned = "";
+
+ for(int i=0;i<aligned.length();i++){
+
+ if (m->control_pressed) { return ""; }
+
+ if(aligned[i] != '-' && aligned[i] != '.'){
+ unaligned += aligned[i];
+ }
+ }
+
+ return unaligned;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "degapSeq");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
+ try {
+ int numOTUs = finalTau.size();
+ int numSeqs = uniqueNames.size();
+
+ ofstream fastaFile(fastaFileName.c_str());
+ ofstream namesFile(namesFileName.c_str());
+ ofstream uMapFile(uMapFileName.c_str());
+
+ vector<int> maxSequenceAbund(numOTUs, 0);
+ vector<int> maxSequenceIndex(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+ if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
+ maxSequenceAbund[otuData[i]] = seqFreq[i];
+ maxSequenceIndex[otuData[i]] = i;
+ }
+ }
+
+ int count = 1;
+
+ for(int i=0;i<numOTUs;i++){
+ if (m->control_pressed) { return 0; }
+
+ if(finalTau[i] > 0){
+
+ if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
+ // cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
+ centroids[i] = maxSequenceIndex[i];
+ }
+
+ int index = centroids[i];
+
+ fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
+ namesFile << uniqueNames[index] << '\t';
+
+ string refSeq = sequences[index];
+ string redundantSeqs = redundantNames[index];;
+
+
+ vector<freqData> frequencyData;
+
+ for(int j=0;j<numSeqs;j++){
+ if(otuData[j] == i && j != index){
+ frequencyData.push_back(freqData(j, seqFreq[j]));
+ }
+ }
+ sort(frequencyData.rbegin(), frequencyData.rend());
+
+ string refDegap = degapSeq(refSeq);
+ vector<int> rUnalign = convertSeq(refDegap);
+
+ uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
+ uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
+
+
+ for(int j=0;j<frequencyData.size();j++){
+ if (m->control_pressed) { return 0; }
+ redundantSeqs += ',' + redundantNames[frequencyData[j].index];
+
+ uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
+
+ string querySeq = sequences[frequencyData[j].index];
+
+ string queryDegap = degapSeq(querySeq);
+ vector<int> qUnalign = convertSeq(queryDegap);
+
+ int udiffs = countDiffs(qUnalign, rUnalign);
+ uMapFile << udiffs << '\t' << queryDegap << endl;
+
+ }
+
+ uMapFile << endl;
+ namesFile << redundantSeqs << endl;
+ count++;
+
+ }
+ }
+ fastaFile.close();
+ namesFile.close();
+ uMapFile.close();
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "seqNoise", "writeOutput");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************
+
+int main(int argc, char *argv[]){
+
+ double sigma = 100;
+ sigma = atof(argv[5]);
+
+ double cutOff = 0.08;
+ int minIter = 10;
+ int maxIter = 1000;
+ double minDelta = 1e-6;
+
+ string sequenceFileName = argv[1];
+ string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
+
+ vector<string> sequences;
+ getSequenceData(sequenceFileName, sequences);
+
+ int numSeqs = sequences.size();
+
+ vector<string> uniqueNames(numSeqs);
+ vector<string> redundantNames(numSeqs);
+ vector<int> seqFreq(numSeqs);
+
+ string namesFileName = argv[4];
+ getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
+
+ string distFileName = argv[2];
+ vector<double> distances(numSeqs * numSeqs);
+ getDistanceData(distFileName, distances);
+
+ string listFileName = argv[3];
+ vector<int> otuData(numSeqs);
+ vector<int> otuFreq;
+ vector<vector<int> > otuBySeqLookUp;
+
+ getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+
+ int numOTUs = otuFreq.size();
+
+ vector<double> weights(numOTUs, 0);
+ vector<int> change(numOTUs, 1);
+ vector<int> centroids(numOTUs, -1);
+ vector<int> cumCount(numOTUs, 0);
+
+ vector<double> tau(numSeqs, 1);
+ vector<int> anP(numSeqs, 0);
+ vector<int> anI(numSeqs, 0);
+ vector<int> anN(numSeqs, 0);
+ vector<vector<int> > aanI = otuBySeqLookUp;
+
+ int numIters = 0;
+ double maxDelta = 1e6;
+
+ while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+
+ updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+ maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
+
+ calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
+ checkCentroids(weights, centroids);
+
+ otuFreq.assign(numOTUs, 0);
+
+ int total = 0;
+
+ for(int i=0;i<numSeqs;i++){
+ double offset = 1e6;
+ double norm = 0.0000;
+ double minWeight = MIN_WEIGHT;
+ vector<double> currentTau(numOTUs);
+
+ for(int j=0;j<numOTUs;j++){
+ if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+ offset = distances[i * numSeqs+centroids[j]];
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if(weights[j] > minWeight){
+ currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+ norm += currentTau[j];
+ }
+ else{
+ currentTau[j] = 0.0000;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ currentTau[j] /= norm;
+ }
+
+ for(int j=0;j<numOTUs;j++){
+
+ if(currentTau[j] > MIN_TAU){
+ int oldTotal = total;
+ total++;
+
+ tau.resize(oldTotal+1);
+ tau[oldTotal] = currentTau[j];
+ otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+ aanI[j][otuFreq[j]] = i;
+ otuFreq[j]++;
+
+ }
+ }
+
+ anP.resize(total);
+ anI.resize(total);
+ }
+
+ numIters++;
+ }
+
+ updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+
+ vector<double> percentage(numSeqs);
+ setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
+ finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
+
+ change.assign(numOTUs, 1);
+ calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
+
+
+ vector<int> finalTau(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++){
+ finalTau[otuData[i]] += int(seqFreq[i]);
+ }
+
+ writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+
+ return 0;
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef SEQNOISE
+#define SEQNOISE
+
+
+
+/*
+ * mySeqNoise.h
+ *
+ *
+ * Created by Pat Schloss on 8/31/11.
+ * Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/*****************************************************************************************************************************/
+/*****************************************************************************************************************************/
+/* NOTE: Order matters in this class. If you are going to use it, make sure your files have the sequences in the same order. */
+/*****************************************************************************************************************************/
+/*****************************************************************************************************************************/
+
+
+#include "mothurout.h"
+/**************************************************************************************************/
+
+struct freqData {
+
+ freqData(int i, int freq) : frequency(freq), index(i){ }
+
+ bool operator<( freqData const& rhs ) const {
+ return frequency < rhs.frequency;
+ }
+
+ int frequency;
+ int index;
+
+};
+/**************************************************************************************************/
+
+class seqNoise {
+public:
+ seqNoise() { m = MothurOut::getInstance(); }
+ ~seqNoise(){}
+
+ int getSequenceData(string, vector<string>&);
+ int addSeq(string, vector<string>&);
+ int getRedundantNames(string, vector<string>&, vector<string>&, vector<int>&);
+ int addRedundantName(string, string, vector<string>&, vector<string>&, vector<int>&);
+ int getDistanceData(string, vector<double>&);
+ int getListData(string, double, vector<int>&, vector<int>&, vector<vector<int> >&);
+ int updateOTUCountData(vector<int>, vector<vector<int> >, vector<vector<int> >, vector<int>&, vector<int>&, vector<int>&);
+ double calcNewWeights(vector<double>&,vector<int>,vector<int>,vector<int>,vector<int>,vector<int>,vector<double>);
+ int calcCentroids(vector<int>,vector<int>,vector<int>&,vector<int>&,vector<int>,vector<double>,vector<int>,vector<int>,vector<double>);
+ int checkCentroids(vector<double>&, vector<int>);
+ int setUpOTUData(vector<int>&, vector<double>&, vector<int>, vector<double>, vector<int>, vector<int>, vector<int>);
+ int finishOTUData(vector<int>, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<double>&);
+ int writeOutput(string, string, string, vector<int>, vector<int>, vector<int>, vector<string>, vector<string>, vector<string>, vector<int>, vector<double>&);
+
+
+private:
+ MothurOut* m;
+
+ int getLastMatch(char, vector<vector<char> >&, int, int, vector<int>&, vector<int>&);
+ int countDiffs(vector<int>, vector<int>);
+ vector<int> convertSeq(string);
+ string degapSeq(string);
+
+};
+
+/**************************************************************************************************/
+#endif
+
//are we at the beginning of the file??
if (m->saveNextLabel == "") {
f >> label;
-
+
//is this a shared file that has headers
if (label == "label") {
//gets "group"
//read in first row since you know there is at least 1 group.
f >> groupN >> num;
-
holdLabel = label;
//add new vector to lookup
}
m->saveNextLabel = nextLabel;
m->setAllGroups(allGroups);
-
}
catch(exception& e) {
m->errorOut(e, "SharedRAbundVector", "SharedRAbundVector");
--- /dev/null
+/*
+ * shhhseqscommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/8/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "shhhseqscommand.h"
+
+
+
+//**********************************************************************************************************************
+vector<string> ShhhSeqsCommand::setParameters(){
+ try {
+ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
+ CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
+ CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+ CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+ CommandParameter psigma("sigma", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(psigma);
+
+ vector<string> myArray;
+ for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
+ return myArray;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "setParameters");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+string ShhhSeqsCommand::getHelpString(){
+ try {
+ string helpString = "";
+ helpString += "The shhh.seqs command reads a fasta and name file and ....\n";
+ helpString += "The shhh.seqs command parameters are fasta, name, group, sigma and processors.\n";
+ helpString += "The fasta parameter allows you to enter the fasta file containing your potentially sequences, and is required, unless you have a valid current fasta file. \n";
+ helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
+ helpString += "The group parameter allows you to provide a group file. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+ helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
+ helpString += "The sigma parameter .... The default is 0.01. \n";
+ helpString += "The shhh.seqs command should be in the following format: \n";
+ helpString += "shhh.seqs(fasta=yourFastaFile, name=yourNameFile) \n";
+ helpString += "Example: shhh.seqs(fasta=AD.align, name=AD.names) \n";
+ helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
+ return helpString;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "getHelpString");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+ShhhSeqsCommand::ShhhSeqsCommand(){
+ try {
+ abort = true; calledHelp = true;
+ setParameters();
+ vector<string> tempOutNames;
+ outputTypes["fasta"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
+ outputTypes["map"] = tempOutNames;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+ShhhSeqsCommand::ShhhSeqsCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ vector<string> myArray = setParameters();
+
+ OptionParser parser(option);
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+ map<string, string>::iterator it;
+
+ //check to make sure all parameters are valid for command
+ for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) {
+ if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; }
+ }
+
+ //initialize outputTypes
+ vector<string> tempOutNames;
+ outputTypes["fasta"] = tempOutNames;
+ outputTypes["name"] = tempOutNames;
+ outputTypes["map"] = tempOutNames;
+
+ //if the user changes the input directory command factory will send this info to us in the output parameter
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+ else {
+ string path;
+ it = parameters.find("fasta");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["fasta"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("name");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["name"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("group");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["group"] = inputDir + it->second; }
+ }
+ }
+
+ //check for required parameters
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not found") {
+ fastafile = m->getFastaFile();
+ if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }
+ else if (fastafile == "not open") { abort = true; }
+ else { m->setFastaFile(fastafile); }
+
+ //if the user changes the output directory command factory will send this info to us in the output parameter
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not found") {
+ namefile = m->getNameFile();
+ if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
+ }
+ else if (namefile == "not open") { namefile = ""; abort = true; }
+ else { m->setNameFile(namefile); }
+
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not found") { groupfile = ""; }
+ else if (groupfile == "not open") { abort = true; groupfile = ""; }
+ else { m->setGroupFile(groupfile); }
+
+ string temp = validParameter.validFile(parameters, "sigma", false); if(temp == "not found"){ temp = "0.01"; }
+ convert(temp, sigma);
+
+ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
+ m->setProcessors(temp);
+ convert(temp, processors);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::execute() {
+ try {
+
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+ if (outputDir == "") { outputDir = m->hasPath(fastafile); }//if user entered a file with a path then preserve it
+ string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.fasta";
+ string nameFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.names";
+ string mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.map";
+
+ if (groupfile != "") {
+ //Parse sequences by group
+ SequenceParser parser(groupfile, fastafile, namefile);
+ vector<string> groups = parser.getNamesOfGroups();
+
+ if (m->control_pressed) { return 0; }
+
+ //clears files
+ ofstream out, out1, out2;
+ m->openOutputFile(outputFileName, out); out.close();
+ m->openOutputFile(nameFileName, out1); out1.close();
+ mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.";
+
+ if(processors == 1) { driverGroups(parser, outputFileName, nameFileName, mapFileName, 0, groups.size(), groups); }
+ else { createProcessesGroups(parser, outputFileName, nameFileName, mapFileName, groups); }
+
+ if (m->control_pressed) { return 0; }
+
+ //deconvolute results by running unique.seqs
+
+
+ if (m->control_pressed) { return 0; }
+
+ }else{
+ vector<string> sequences;
+ vector<string> uniqueNames;
+ vector<string> redundantNames;
+ vector<int> seqFreq;
+
+ seqNoise noise;
+ correctDist* correct = new correctDist(processors);
+
+ //reads fasta and name file and loads them in order
+ readData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq);
+ if (m->control_pressed) { return 0; }
+
+ //calc distances for cluster
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.dist";
+ correct->execute(distFileName);
+ delete correct;
+
+ if (m->control_pressed) { m->mothurRemove(distFileName); return 0; }
+
+ driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, outputFileName, nameFileName, mapFileName);
+ }
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { m->mothurRemove(outputNames[j]); } return 0; }
+
+ outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
+ outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+ outputNames.push_back(mapFileName); outputTypes["map"].push_back(mapFileName);
+
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ //set accnos file as new current accnosfile
+ string current = "";
+ itTypes = outputTypes.find("fasta");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+ }
+
+ itTypes = outputTypes.find("name");
+ if (itTypes != outputTypes.end()) {
+ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+ }
+
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::readData(correctDist* correct, seqNoise& noise, vector<string>& seqs, vector<string>& uNames, vector<string>& rNames, vector<int>& freq) {
+ try {
+ map<string, string> nameMap;
+ map<string, string>::iterator it;
+ m->readNames(namefile, nameMap);
+ bool error = false;
+
+ ifstream in;
+ m->openInputFile(fastafile, in);
+
+ while (!in.eof()) {
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ Sequence seq(in); m->gobble(in);
+
+ if (seq.getName() != "") {
+ correct->addSeq(seq.getName(), seq.getAligned());
+
+ it = nameMap.find(seq.getName());
+ if (it != nameMap.end()) {
+ noise.addSeq(seq.getAligned(), seqs);
+ noise.addRedundantName(it->first, it->second, uNames, rNames, freq);
+ }else {
+ m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your namefile, please correct.");
+ error = true;
+ }
+ }
+ }
+ in.close();
+
+ if (error) { m->control_pressed = true; }
+
+ return seqs.size();
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "readData");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::loadData(correctDist* correct, seqNoise& noise, vector<string>& seqs, vector<string>& uNames, vector<string>& rNames, vector<int>& freq, map<string, string>& nameMap, vector<Sequence>& sequences) {
+ try {
+ bool error = false;
+ map<string, string>::iterator it;
+
+ for (int i = 0; i < sequences.size(); i++) {
+
+ if (m->control_pressed) { return 0; }
+
+ if (sequences[i].getName() != "") {
+ correct->addSeq(sequences[i].getName(), sequences[i].getAligned());
+
+ it = nameMap.find(sequences[i].getName());
+ if (it != nameMap.end()) {
+ noise.addSeq(sequences[i].getAligned(), seqs);
+ noise.addRedundantName(it->first, it->second, uNames, rNames, freq);
+ }else {
+ m->mothurOut("[ERROR]: " + sequences[i].getName() + " is in your fasta file and not in your namefile, please correct.");
+ error = true;
+ }
+ }
+ }
+
+ if (error) { m->control_pressed = true; }
+
+ return seqs.size();
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "loadData");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int ShhhSeqsCommand::createProcessesGroups(SequenceParser& parser, string newFName, string newNName, string newMName, vector<string> groups) {
+ try {
+
+ vector<int> processIDS;
+ int process = 1;
+
+ //sanity check
+ if (groups.size() < processors) { processors = groups.size(); }
+
+ //divide the groups between the processors
+ vector<linePair> lines;
+ int numGroupsPerProcessor = groups.size() / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numGroupsPerProcessor;
+ int endIndex = (i+1) * numGroupsPerProcessor;
+ if(i == (processors - 1)){ endIndex = groups.size(); }
+ lines.push_back(linePair(startIndex, endIndex));
+ }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMName, lines[process].start, lines[process].end, groups);
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //do my part
+ driverGroups(parser, newFName, newNName, newMName, lines[0].start, lines[0].end, groups);
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+#else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the shhhseqsData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<shhhseqsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ string extension = toString(i) + ".temp";
+
+ shhhseqsData* tempShhhseqs = new shhhseqsData(fastafile, namefile, groupfile, (newFName+extension), (newNName+extension), newMName, groups, m, lines[i].start, lines[i].end, sigma, i);
+ pDataArray.push_back(tempShhhseqs);
+ processIDS.push_back(i);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyShhhSeqsThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+
+ //using the main process as a worker saves time and memory
+ driverGroups(parser, newFName, newNName, newMName, lines[0].start, lines[0].end, groups);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+
+ //append output files
+ for(int i=0;i<processIDS.size();i++){
+ m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
+ m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
+ m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "createProcessesGroups");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int ShhhSeqsCommand::driverGroups(SequenceParser& parser, string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
+ try {
+
+ for (int i = start; i < end; i++) {
+
+ start = time(NULL);
+
+ if (m->control_pressed) { return 0; }
+
+ m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
+
+ map<string, string> thisNameMap;
+ thisNameMap = parser.getNameMap(groups[i]);
+ vector<Sequence> thisSeqs = parser.getSeqs(groups[i]);
+
+ vector<string> sequences;
+ vector<string> uniqueNames;
+ vector<string> redundantNames;
+ vector<int> seqFreq;
+
+ seqNoise noise;
+ correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
+
+ //load this groups info in order
+ loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
+ if (m->control_pressed) { return 0; }
+
+ //calc distances for cluster
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + groups[i] + ".shhh.dist";
+ correct->execute(distFileName);
+ delete correct;
+
+ if (m->control_pressed) { m->mothurRemove(distFileName); return 0; }
+
+ driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map");
+
+ if (m->control_pressed) { return 0; }
+
+ m->appendFiles(newFFile+groups[i], newFFile); m->mothurRemove(newFFile+groups[i]);
+ m->appendFiles(newNFile+groups[i], newNFile); m->mothurRemove(newNFile+groups[i]);
+
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + groups[i] + "."); m->mothurOutEndLine();
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "driverGroups");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::driver(seqNoise& noise,
+ vector<string>& sequences,
+ vector<string>& uniqueNames,
+ vector<string>& redundantNames,
+ vector<int>& seqFreq,
+ string distFileName, string outputFileName, string nameFileName, string mapFileName) {
+ try {
+ double cutOff = 0.08;
+ int minIter = 10;
+ int maxIter = 1000;
+ double minDelta = 1e-6;
+ int numIters = 0;
+ double maxDelta = 1e6;
+ int numSeqs = sequences.size();
+
+ //run cluster command
+ string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: cluster(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* clusterCommand = new ClusterCommand(inputString);
+ clusterCommand->execute();
+
+ map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
+ string listFileName = filenames["list"][0];
+ string rabundFileName = filenames["rabund"][0]; m->mothurRemove(rabundFileName);
+ string sabundFileName = filenames["sabund"][0]; m->mothurRemove(sabundFileName);
+
+ delete clusterCommand;
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ if (m->control_pressed) { m->mothurRemove(distFileName); m->mothurRemove(listFileName); return 0; }
+
+ vector<double> distances(numSeqs * numSeqs);
+ noise.getDistanceData(distFileName, distances);
+ m->mothurRemove(distFileName);
+ if (m->control_pressed) { m->mothurRemove(listFileName); return 0; }
+
+ vector<int> otuData(numSeqs);
+ vector<int> otuFreq;
+ vector<vector<int> > otuBySeqLookUp;
+ noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+ m->mothurRemove(listFileName);
+ if (m->control_pressed) { return 0; }
+
+ int numOTUs = otuFreq.size();
+
+ vector<double> weights(numOTUs, 0);
+ vector<int> change(numOTUs, 1);
+ vector<int> centroids(numOTUs, -1);
+ vector<int> cumCount(numOTUs, 0);
+
+ vector<double> tau(numSeqs, 1);
+ vector<int> anP(numSeqs, 0);
+ vector<int> anI(numSeqs, 0);
+ vector<int> anN(numSeqs, 0);
+ vector<vector<int> > aanI = otuBySeqLookUp;
+
+ while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+
+ if (m->control_pressed) { return 0; }
+
+ noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (m->control_pressed) { return 0; }
+ maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); if (m->control_pressed) { return 0; }
+
+ noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; }
+ noise.checkCentroids(weights, centroids); if (m->control_pressed) { return 0; }
+
+ otuFreq.assign(numOTUs, 0);
+
+ int total = 0;
+
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+
+ double offset = 1e6;
+ double norm = 0.0000;
+ double minWeight = 0.1;
+ vector<double> currentTau(numOTUs);
+
+ for(int j=0;j<numOTUs;j++){
+ if (m->control_pressed) { return 0; }
+ if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+ offset = distances[i * numSeqs+centroids[j]];
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (m->control_pressed) { return 0; }
+ if(weights[j] > minWeight){
+ currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+ norm += currentTau[j];
+ }
+ else{
+ currentTau[j] = 0.0000;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (m->control_pressed) { return 0; }
+ currentTau[j] /= norm;
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (m->control_pressed) { return 0; }
+
+ if(currentTau[j] > 1.0e-4){
+ int oldTotal = total;
+ total++;
+
+ tau.resize(oldTotal+1);
+ tau[oldTotal] = currentTau[j];
+ otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+ aanI[j][otuFreq[j]] = i;
+ otuFreq[j]++;
+
+ }
+ }
+
+ anP.resize(total);
+ anI.resize(total);
+ }
+
+ numIters++;
+ }
+
+ noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (m->control_pressed) { return 0; }
+
+ vector<double> percentage(numSeqs);
+ noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); if (m->control_pressed) { return 0; }
+ noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); if (m->control_pressed) { return 0; }
+
+ change.assign(numOTUs, 1);
+ noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; }
+
+
+ vector<int> finalTau(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++){
+ if (m->control_pressed) { return 0; }
+ finalTau[otuData[i]] += int(seqFreq[i]);
+ }
+
+ noise.writeOutput(outputFileName, nameFileName, mapFileName, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+
+ return 0;
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "driver");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::deconvoluteResults(string fastaFile, string nameFile){
+ try {
+ m->mothurOutEndLine(); m->mothurOut("Deconvoluting results:"); m->mothurOutEndLine(); m->mothurOutEndLine();
+
+ //use unique.seqs to create new name and fastafile
+ string inputString = "fasta=" + fastaFile + ", name=" + nameFile;
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* uniqueCommand = new DeconvoluteCommand(inputString);
+ uniqueCommand->execute();
+
+ map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+
+ delete uniqueCommand;
+
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ string newnameFile = filenames["name"][0];
+ string newfastaFile = filenames["fasta"][0];
+
+ m->mothurRemove(fastaFile); rename(newfastaFile.c_str(), fastaFile.c_str());
+ m->mothurRemove(nameFile); rename(newnameFile.c_str(), nameFile.c_str());
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhhSeqsCommand", "deconvoluteResults");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+
+
--- /dev/null
+#ifndef SHHHSEQSCOMMAND_H
+#define SHHHSEQSCOMMAND_H
+
+/*
+ * shhhseqscommand.h
+ * Mothur
+ *
+ * Created by westcott on 11/8/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "myseqdist.h"
+#include "seqnoise.h"
+#include "sequenceparser.h"
+#include "deconvolutecommand.h"
+#include "clustercommand.h"
+
+//**********************************************************************************************************************
+
+class ShhhSeqsCommand : public Command {
+
+public:
+ ShhhSeqsCommand(string);
+ ShhhSeqsCommand();
+ ~ShhhSeqsCommand() {}
+
+ vector<string> setParameters();
+ string getCommandName() { return "shhh.seqs"; }
+ string getCommandCategory() { return "Sequence Processing"; }
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Shhh.seqs"; }
+ string getDescription() { return "shhh.seqs"; }
+
+
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+private:
+
+ struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ };
+
+ bool abort;
+ string outputDir, fastafile, namefile, groupfile;
+ int processors;
+ double sigma;
+ vector<string> outputNames;
+
+ int readData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&);
+ int loadData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, map<string, string>&, vector<Sequence>&);
+
+ int driver(seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, string, string, string, string);
+ int driverGroups(SequenceParser&, string, string, string, int, int, vector<string>);
+ int createProcessesGroups(SequenceParser&, string, string, string, vector<string>);
+ int deconvoluteResults(string, string);
+
+
+
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct shhhseqsData {
+ string fastafile;
+ string namefile;
+ string groupfile;
+ string newFName, newNName, newMName;
+ MothurOut* m;
+ int start;
+ int end;
+ int sigma, threadID;
+ vector<string> groups;
+
+ shhhseqsData(){}
+ shhhseqsData(string f, string n, string g, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int s, int tid) {
+ fastafile = f;
+ namefile = n;
+ groupfile = g;
+ newFName = nff;
+ newNName = nnf;
+ newNName = nmf;
+ m = mout;
+ start = st;
+ end = en;
+ sigma = s;
+ threadID = tid;
+ groups = gr;
+ }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){
+ shhhseqsData* pDataArray;
+ pDataArray = (shhhseqsData*)lpParam;
+
+ try {
+
+ //parse fasta and name file by group
+ SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);
+
+ //precluster each group
+ for (int k = pDataArray->start; k < pDataArray->end; k++) {
+
+ int start = time(NULL);
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
+
+ map<string, string> thisNameMap;
+ thisNameMap = parser.getNameMap(pDataArray->groups[k]);
+ vector<Sequence> thisSeqs = parser.getSeqs(pDataArray->groups[k]);
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ vector<string> sequences;
+ vector<string> uniqueNames;
+ vector<string> redundantNames;
+ vector<int> seqFreq;
+
+ seqNoise noise;
+ correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
+
+ //load this groups info in order
+ //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
+ //////////////////////////////////////////////////////////////////////////////////////////////////
+ bool error = false;
+ map<string, string>::iterator it;
+
+ for (int i = 0; i < thisSeqs.size(); i++) {
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ if (thisSeqs[i].getName() != "") {
+ correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned());
+
+ it = thisNameMap.find(thisSeqs[i].getName());
+ if (it != thisNameMap.end()) {
+ noise.addSeq(thisSeqs[i].getAligned(), sequences);
+ noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq);
+ }else {
+ pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct.");
+ error = true;
+ }
+ }
+ }
+
+ if (error) { return 0; }
+ //////////////////////////////////////////////////////////////////////////////////////////////////
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ //calc distances for cluster
+ string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist";
+ correct->execute(distFileName);
+ delete correct;
+
+ if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; }
+
+ //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map");
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ double cutOff = 0.08;
+ int minIter = 10;
+ int maxIter = 1000;
+ double minDelta = 1e-6;
+ int numIters = 0;
+ double maxDelta = 1e6;
+ int numSeqs = sequences.size();
+
+ //run cluster command
+ string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
+ pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine();
+
+ Command* clusterCommand = new ClusterCommand(inputString);
+ clusterCommand->execute();
+
+ map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
+ string listFileName = filenames["list"][0];
+ string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName);
+ string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName);
+
+ delete clusterCommand;
+ pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine();
+
+ if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; }
+
+ vector<double> distances(numSeqs * numSeqs);
+ noise.getDistanceData(distFileName, distances);
+ pDataArray->m->mothurRemove(distFileName);
+ if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; }
+
+ vector<int> otuData(numSeqs);
+ vector<int> otuFreq;
+ vector<vector<int> > otuBySeqLookUp;
+ noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+ pDataArray->m->mothurRemove(listFileName);
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ int numOTUs = otuFreq.size();
+
+ vector<double> weights(numOTUs, 0);
+ vector<int> change(numOTUs, 1);
+ vector<int> centroids(numOTUs, -1);
+ vector<int> cumCount(numOTUs, 0);
+
+ vector<double> tau(numSeqs, 1);
+ vector<int> anP(numSeqs, 0);
+ vector<int> anI(numSeqs, 0);
+ vector<int> anN(numSeqs, 0);
+ vector<vector<int> > aanI = otuBySeqLookUp;
+
+ while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
+ maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
+
+ noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
+ noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; }
+
+ otuFreq.assign(numOTUs, 0);
+
+ int total = 0;
+
+ for(int i=0;i<numSeqs;i++){
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ double offset = 1e6;
+ double norm = 0.0000;
+ double minWeight = 0.1;
+ vector<double> currentTau(numOTUs);
+
+ for(int j=0;j<numOTUs;j++){
+ if (pDataArray->m->control_pressed) { return 0; }
+ if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+ offset = distances[i * numSeqs+centroids[j]];
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (pDataArray->m->control_pressed) { return 0; }
+ if(weights[j] > minWeight){
+ currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+ norm += currentTau[j];
+ }
+ else{
+ currentTau[j] = 0.0000;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (pDataArray->m->control_pressed) { return 0; }
+ currentTau[j] /= norm;
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ if(currentTau[j] > 1.0e-4){
+ int oldTotal = total;
+ total++;
+
+ tau.resize(oldTotal+1);
+ tau[oldTotal] = currentTau[j];
+ otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+ aanI[j][otuFreq[j]] = i;
+ otuFreq[j]++;
+
+ }
+ }
+
+ anP.resize(total);
+ anI.resize(total);
+ }
+
+ numIters++;
+ }
+
+ noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
+
+ vector<double> percentage(numSeqs);
+ noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI); if (pDataArray->m->control_pressed) { return 0; }
+ noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau); if (pDataArray->m->control_pressed) { return 0; }
+
+ change.assign(numOTUs, 1);
+ noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
+
+
+ vector<int> finalTau(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++){
+ if (pDataArray->m->control_pressed) { return 0; }
+ finalTau[otuData[i]] += int(seqFreq[i]);
+ }
+
+ noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]);
+ pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]);
+
+ pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine();
+
+
+ }
+
+ return 0;
+
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction");
+ exit(1);
+ }
+}
+#endif
+/**************************************************************************************************/
+
+#endif
ofstream output;
if(allFiles){
-
+ set<string> namesAlreadyProcessed;
flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
m->openOutputFile(flowFilesFileName, output);
for(int i=0;i<barcodePrimerComboFileNames.size();i++){
for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
-
- FILE * pFile;
- unsigned long long size;
-
- //get num bytes in file
- pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
- if (pFile==NULL) perror ("Error opening file");
- else{
- fseek (pFile, 0, SEEK_END);
- size=ftell(pFile);
- fclose (pFile);
- }
-
- if(size < 10){
- m->mothurRemove(barcodePrimerComboFileNames[i][j]);
- }
- else{
- output << barcodePrimerComboFileNames[i][j] << endl;
- outputNames.push_back(barcodePrimerComboFileNames[i][j]);
- outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
+ if (namesAlreadyProcessed.count(barcodePrimerComboFileNames[i][j]) == 0) {
+ FILE * pFile;
+ unsigned long long size;
+
+ //get num bytes in file
+ pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
+ if (pFile==NULL) perror ("Error opening file");
+ else{
+ fseek (pFile, 0, SEEK_END);
+ size=ftell(pFile);
+ fclose (pFile);
+ }
+
+ if(size < 10){
+ m->mothurRemove(barcodePrimerComboFileNames[i][j]);
+ }
+ else{
+ output << barcodePrimerComboFileNames[i][j] << endl;
+ outputNames.push_back(barcodePrimerComboFileNames[i][j]);
+ outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
+ }
+ namesAlreadyProcessed.insert(barcodePrimerComboFileNames[i][j]);
}
}
}