X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=shhhercommand.h;h=c787fe41d5c0f2a30975feb9461bdaa3194aae18;hp=b423b4d320acd81ecb1c2e67b1f4ba6c3ad9d3fc;hb=615301e57c25e241356a9c2380648d117709458d;hpb=a3ab9317d12127ff88b039c9933e98eeb562eaa0 diff --git a/shhhercommand.h b/shhhercommand.h index b423b4d..c787fe4 100644 --- a/shhhercommand.h +++ b/shhhercommand.h @@ -12,46 +12,142 @@ #include "mothur.h" #include "command.hpp" -#include "globaldata.hpp" +#include "readcolumn.h" +#include "readmatrix.hpp" +#include "rabundvector.hpp" +#include "sabundvector.hpp" +#include "listvector.hpp" +#include "cluster.hpp" +#include + +//********************************************************************************************************************** + +#define NUMBINS 1000 +#define HOMOPS 10 +#define MIN_COUNT 0.1 +#define MIN_WEIGHT 0.1 +#define MIN_TAU 0.0001 +#define MIN_ITER 10 +//********************************************************************************************************************** class ShhherCommand : public Command { public: ShhherCommand(string); ShhherCommand(); - ~ShhherCommand(); - vector getRequiredParameters(); - vector getValidParameters(); - vector getRequiredFiles(); - map > getOutputFiles() { return outputTypes; } - int execute(); - void help(); + ~ShhherCommand() {} -private: - GlobalData* globaldata; + vector setParameters(); + string getCommandName() { return "shhh.flows"; } + string getCommandCategory() { return "Sequence Processing"; } - int abort; - map > outputTypes; + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "Schloss PD, Gevers D, Westcott SL (2011). Reducing the effects of PCR amplification and sequencing artifacts on 16S rRNA-based studies. PLoS ONE. 6:e27310.\nQuince C, Lanzen A, Davenport RJ, Turnbaugh PJ (2011). Removing noise from pyrosequenced amplicons. BMC Bioinformatics 12:38.\nQuince C, Lanzén A, Curtis TP, Davenport RJ, Hall N, Head IM, Read LF, Sloan WT (2009). Accurate determination of microbial diversity from 454 pyrosequencing data. Nat. Methods 6:639.\nhttp://www.mothur.org/wiki/Shhh.flows"; } + string getDescription() { return "shhh.flows"; } + - string outputDir, flowFileName, flowFilesFileName, lookupFileName; - int processors, maxIters; - float cutoff, sigma, minDelta; + int execute(); + void help() { m->mothurOut(getHelpString()); } +private: - vector nSeqsBreaks; - vector nOTUsBreaks; - vector flowDataPrI; - vector flowDataIntI; - vector lengths; - vector seqNameVector; + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + bool abort, large; + string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName; + + int processors, maxIters, largeSize; + float cutoff, sigma, minDelta; + string flowOrder; + + vector outputNames; vector singleLookUp; vector jointLookUp; + vector flowFileVector; + + vector parseFlowFiles(string); + int driver(vector, string, string); + int createProcesses(vector); + int getFlowData(string, vector&, vector&, vector&, map&, int&); + int getUniques(int, int, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&); + int flowDistParentFork(int, string, int, vector&, vector&, vector&, vector&, vector&); + float calcPairwiseDist(int, int, int, vector&, vector&, vector&, vector&); + int createNamesFile(int, int, string, vector&, vector&, vector&); + int cluster(string, string, string); + int getOTUData(int numSeqs, string, vector&, vector&, vector&, vector >&, vector >&, vector&, vector&,map&); + int calcCentroidsDriver(int numOTUs, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, int, vector&); + double getDistToCentroid(int, int, int, vector&, vector&, int); + double getNewWeights(int, vector&, vector&, vector&, vector&, vector&); + + double getLikelihood(int, int, vector&, vector&, vector&, vector&, vector&, vector&); + int checkCentroids(int, vector&, vector&); + void calcNewDistances(int, int, vector& , vector&,vector& , vector& change, vector&,vector >&, vector&, vector >&, vector&, vector&, vector&, vector&, int, vector&); + int fill(int, vector&, vector&, vector&, vector&, vector >&, vector >&); + void setOTUs(int, int, vector&, vector&, vector&, vector&, + vector&, vector&, vector&, vector >&, vector >&); + void writeQualities(int, int, string, vector, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector&, vector >&); + void writeSequences(string, int, int, string, vector, vector&, vector&, vector >&, vector&); + void writeNames(string, int, string, vector, vector&, vector >&, vector&); + void writeGroups(string, string, int, vector&); + void writeClusters(string, int, int, vector, vector&, vector&, vector&, vector >&, vector&, vector&, vector&); + + void getSingleLookUp(); + void getJointLookUp(); + double getProbIntensity(int); + + +#ifdef USE_MPI + string flowDistMPI(int, int); + void calcNewDistancesChildMPI(int, int, vector&); + + int pid, ncpus; + + void getFlowData(); + void getUniques(); + + float calcPairwiseDist(int, int); + void flowDistParentFork(string, int, int); + + string createDistFile(int); + string createNamesFile(); + string cluster(string, string); + + void getOTUData(string); + void initPyroCluster(); + void fill(); + void calcCentroids(); + void calcCentroidsDriver(int, int); + double getDistToCentroid(int, int, int); + double getNewWeights(); + double getLikelihood(); + void checkCentroids(); + void calcNewDistances(); + void calcNewDistancesParent(int, int); + void calcNewDistancesChild(int, int, vector&, vector&, vector&); + + + void setOTUs(); + void writeQualities(vector); + void writeSequences(vector); + void writeNames(vector); + void writeGroups(); + void writeClusters(vector); + + vector seqNameVector; + vector lengths; + vector flowDataIntI; + vector flowDataPrI; map nameMap; vector otuData; vector cumNumSeqs; vector nSeqsPerOTU; vector > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices - vector seqNumber; //tMaster->anP: the sequence id number sorted by OTU vector > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI + vector seqNumber; //tMaster->anP: the sequence id number sorted by OTU vector seqIndex; //tMaster->anI; the index that corresponds to seqNumber vector dist; //adDist - distance of sequences to centroids vector change; //did the centroid sequence change? 0 = no; 1 = yes @@ -60,54 +156,1213 @@ private: vector singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs) vector uniqueFlowgrams; vector uniqueCount; - vector uniqueLengths; vector mapSeqToUnique; vector mapUniqueToSeq; + vector uniqueLengths; + int numSeqs, numUniques, numOTUs, numFlowCells; + vector nSeqsBreaks; + vector nOTUsBreaks; + +#endif - int numSeqs, numUniques, numOTUs, numFlowCells; +}; + +/************************************************************************************************** +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct shhhFlowsData { + int threadID, maxIters; + float cutoff, sigma, minDelta; + string flowOrder; + vector singleLookUp; + vector jointLookUp; + vector filenames; + vector outputNames; + string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir; + int start, stop; + MothurOut* m; - void getSingleLookUp(); - void getJointLookUp(); - void getFlowData(); - void getUniques(); - double getProbIntensity(int); - float calcPairwiseDist(int, int); - void flowDistParentFork(string, int, int); + shhhFlowsData(){} + shhhFlowsData(vector f, string cf, string cn, string ou, string flor, vector jl, vector sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) { + filenames = f; + thisCompositeFASTAFileName = cf; + thisCompositeNameFileName = cn; + outputDir = ou; + flowOrder = flor; + m = mout; + start = st; + stop = sp; + cutoff= cut; + sigma = si; + minDelta = mD; + maxIters = mx; + jointLookUp = jl; + singleLookUp = sl; + threadID = tid; + } +}; + +/************************************************************************************************** +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){ + shhhFlowsData* pDataArray; + pDataArray = (shhhFlowsData*)lpParam; - string createDistFile(int); - string createNamesFile(); - string cluster(string, string); + try { + + for(int l=pDataArray->start;lstop;l++){ + + if (pDataArray->m->control_pressed) { break; } + + string flowFileName = pDataArray->filenames[l]; + + pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n"); + pDataArray->m->mothurOut("Reading flowgrams...\n"); + + vector seqNameVector; + vector lengths; + vector flowDataIntI; + vector flowDataPrI; + map nameMap; + vector uniqueFlowgrams; + vector uniqueCount; + vector mapSeqToUnique; + vector mapUniqueToSeq; + vector uniqueLengths; + int numFlowCells; + + //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells); + /***************************************************************************************************** + + ifstream flowFile; + // cout << "herethread " << flowFileName << '\t' << &flowFile << endl; + pDataArray->m->openInputFile(flowFileName, flowFile); + + // cout << "herethread " << flowFileName << endl; + string seqName; + int currentNumFlowCells; + float intensity; + + flowFile >> numFlowCells; + int index = 0;//pcluster + while(!flowFile.eof()){ + + if (pDataArray->m->control_pressed) { flowFile.close(); return 0; } + + flowFile >> seqName >> currentNumFlowCells; + lengths.push_back(currentNumFlowCells); + // cout << "herethread " << seqName << endl; + seqNameVector.push_back(seqName); + nameMap[seqName] = index++;//pcluster + + for(int i=0;i> intensity; + if(intensity > 9.99) { intensity = 9.99; } + int intI = int(100 * intensity + 0.0001); + flowDataIntI.push_back(intI); + } + pDataArray->m->gobble(flowFile); + } + flowFile.close(); + + int numSeqs = seqNameVector.size(); + // cout << numSeqs << endl; + for(int i=0;im->control_pressed) { return 0; } + + int iNumFlowCells = i * numFlowCells; + for(int j=lengths[i];j&, vector&, vector&); - - - void setOTUs(); - void writeQualities(vector); - void writeSequences(vector); - void writeNames(vector); - void writeGroups(); - void writeClusters(vector); + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("Identifying unique flowgrams...\n"); + //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI); + /***************************************************************************************************** + int numUniques = 0; + uniqueFlowgrams.assign(numFlowCells * numSeqs, -1); + uniqueCount.assign(numSeqs, 0); // anWeights + uniqueLengths.assign(numSeqs, 0); + mapSeqToUnique.assign(numSeqs, -1); + mapUniqueToSeq.assign(numSeqs, -1); + + vector uniqueFlowDataIntI(numFlowCells * numSeqs, -1); + + for(int i=0;im->control_pressed) { return 0; } + + int index = 0; + + vector current(numFlowCells); + for(int j=0;j uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; } + break; + } + index++; + } + + if(index == numUniques){ + uniqueLengths[numUniques] = lengths[i]; + uniqueCount[numUniques] = 1; + mapSeqToUnique[i] = numUniques;//anMap + mapUniqueToSeq[numUniques] = i;//anF + + for(int k=0;km->control_pressed) { return 0; } + //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); + + flowDataPrI[i] = 100000000; + + for(int j=0;jm->control_pressed) { return 0; } + + float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]]; + if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; } + } + } + + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("Calculating distances between flowgrams...\n"); + string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist"; + unsigned long long begTime = time(NULL); + double begClock = clock(); + + //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); + /***************************************************************************************************** + ostringstream outStream; + outStream.setf(ios::fixed, ios::floatfield); + outStream.setf(ios::dec, ios::basefield); + outStream.setf(ios::showpoint); + outStream.precision(6); + + int thisbegTime = time(NULL); + double thisbegClock = clock(); + + for(int i=0;im->control_pressed) { break; } + + for(int j=0;jm->control_pressed) { break; } + + int flowAIntI = flowDataIntI[ANumFlowCells + k]; + float flowAPrI = flowDataPrI[ANumFlowCells + k]; + + int flowBIntI = flowDataIntI[BNumFlowCells + k]; + float flowBPrI = flowDataPrI[BNumFlowCells + k]; + flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI; + } + + flowDistance /= (float) minLength; + /***************************************************************************************************** - -#ifdef USE_MPI - string flowDistMPI(int, int); - void calcNewDistancesChildMPI(int, int, vector&); + if(flowDistance < 1e-6){ + outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl; + } + else if(flowDistance <= pDataArray->cutoff){ + outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl; + } + } + if(i % 100 == 0){ + pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime)); + pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + } + + ofstream distFile(distFileName.c_str()); + distFile << outStream.str(); + distFile.close(); + + if (pDataArray->m->control_pressed) {} + else { + pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime)); + pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC)); + pDataArray->m->mothurOutEndLine(); + } + /***************************************************************************************************** - int pid, ncpus; + pDataArray->m->mothurOutEndLine(); + pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n'); + + string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; + //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq); + /***************************************************************************************************** + vector duplicateNames(numUniques, ""); + for(int i=0;im->openOutputFile(namesFileName, nameFile); + + for(int i=0;im->control_pressed) { nameFile.close(); return 0; } + nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; + } + nameFile.close(); + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { return 0; } + + pDataArray->m->mothurOut("\nClustering flowgrams...\n"); + string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list"; + //cluster(listFileName, distFileName, namesFileName); + /***************************************************************************************************** + ReadMatrix* read = new ReadColumnMatrix(distFileName); + read->setCutoff(pDataArray->cutoff); + + NameAssignment* clusterNameMap = new NameAssignment(namesFileName); + clusterNameMap->readMap(); + read->read(clusterNameMap); + + ListVector* list = read->getListVector(); + SparseMatrix* matrix = read->getMatrix(); + + delete read; + delete clusterNameMap; + + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); + + Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest"); + string tag = cluster->getTag(); + + double clusterCutoff = pDataArray->cutoff; + while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){ + + if (pDataArray->m->control_pressed) { break; } + + cluster->update(clusterCutoff); + } + + list->setLabel(toString(pDataArray->cutoff)); + + ofstream listFileOut; + pDataArray->m->openOutputFile(listFileName, listFileOut); + list->print(listFileOut); + listFileOut.close(); + + delete matrix; delete cluster; delete rabund; delete list; + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { return 0; } + + vector otuData; + vector cumNumSeqs; + vector nSeqsPerOTU; + vector > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices + vector > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI + vector seqNumber; //tMaster->anP: the sequence id number sorted by OTU + vector seqIndex; //tMaster->anI; the index that corresponds to seqNumber + + + //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap); + /***************************************************************************************************** + ifstream listFile; + pDataArray->m->openInputFile(listFileName, listFile); + string label; + int numOTUs; + + listFile >> label >> numOTUs; + + otuData.assign(numSeqs, 0); + cumNumSeqs.assign(numOTUs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + aaP.clear();aaP.resize(numOTUs); + + seqNumber.clear(); + aaI.clear(); + seqIndex.clear(); + + string singleOTU = ""; + + for(int i=0;im->control_pressed) { break; } + + listFile >> singleOTU; + + istringstream otuString(singleOTU); + + while(otuString){ + + string seqName = ""; + + for(int j=0;j::iterator nmIt = nameMap.find(seqName); + int index = nmIt->second; + + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + seqName = ""; + } + } + + map::iterator nmIt = nameMap.find(seqName); + + int index = nmIt->second; + nameMap.erase(nmIt); + + otuData[index] = i; + nSeqsPerOTU[i]++; + aaP[i].push_back(index); + + otuString.get(); + } + + sort(aaP[i].begin(), aaP[i].end()); + for(int j=0;jm->control_pressed) { return 0; } + + pDataArray->m->mothurRemove(distFileName); + pDataArray->m->mothurRemove(namesFileName); + pDataArray->m->mothurRemove(listFileName); + + vector dist; //adDist - distance of sequences to centroids + vector change; //did the centroid sequence change? 0 = no; 1 = yes + vector centroids; //the representative flowgram for each cluster m + vector weight; + vector singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs) + vector nSeqsBreaks; + vector nOTUsBreaks; + + dist.assign(numSeqs * numOTUs, 0); + change.assign(numOTUs, 1); + centroids.assign(numOTUs, -1); + weight.assign(numOTUs, 0); + singleTau.assign(numSeqs, 1.0); + + nSeqsBreaks.assign(2, 0); + nOTUsBreaks.assign(2, 0); + + nSeqsBreaks[0] = 0; + nSeqsBreaks[1] = numSeqs; + nOTUsBreaks[1] = numOTUs; + + if (pDataArray->m->control_pressed) { break; } + + double maxDelta = 0; + int iter = 0; + + begClock = clock(); + begTime = time(NULL); + + pDataArray->m->mothurOut("\nDenoising flowgrams...\n"); + pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n"); + + while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){ + + if (pDataArray->m->control_pressed) { break; } + + double cycClock = clock(); + unsigned long long cycTime = time(NULL); + //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); + /***************************************************************************************************** + int indexFill = 0; + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; + for(int j=0;jm->control_pressed) { break; } + + //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); + /***************************************************************************************************** + for(int i=0;im->control_pressed) { break; } + + double count = 0; + int position = 0; + int minFlowGram = 100000000; + double minFlowValue = 1e8; + change[i] = 0; //FALSE + + for(int j=0;j 0 && count > MIN_COUNT){ + vector adF(nSeqsPerOTU[i]); + vector anL(nSeqsPerOTU[i]); + + for(int j=0;jsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + dist = dist / (double)lengths[nI]; + /***************************************************************************************************** + adF[k] += dist * tauValue; + } + } + + for(int j=0;jm->control_pressed) { break; } + + //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight); + /***************************************************************************************************** + double maxChange = 0; + + for(int i=0;im->control_pressed) { break; } + + double difference = weight[i]; + weight[i] = 0; + + for(int j=0;j maxChange){ maxChange = difference; } + } + maxDelta = maxChange; + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { break; } + + //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); + /***************************************************************************************************** + vector P(numSeqs, 0); + int effNumOTUs = 0; + + for(int i=0;i MIN_WEIGHT){ + effNumOTUs++; + } + } + + string hold; + for(int i=0;im->control_pressed) { break; } + + for(int j=0;jsigma); + } + } + double nLL = 0.00; + for(int i=0;isigma); + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { break; } + + //checkCentroids(numOTUs, centroids, weight); + /***************************************************************************************************** + vector unique(numOTUs, 1); + + for(int i=0;im->control_pressed) { break; } + + if(unique[i] == 1){ + for(int j=i+1;jm->control_pressed) { break; } + + //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths); + /***************************************************************************************************** + int total = 0; + vector newTau(numOTUs,0); + vector norms(numSeqs, 0); + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=0;im->control_pressed) { break; } + + int indexOffset = i * numOTUs; + + double offset = 1e8; + + for(int j=0;j MIN_WEIGHT && change[j] == 1){ + //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells); + /***************************************************************************************************** + int flowAValue = centroids[j] * numFlowCells; + int flowBValue = i * numFlowCells; + + double distTemp = 0; + + for(int l=0;lsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + dist[indexOffset + j] = distTemp / (double)lengths[i]; + /***************************************************************************************************** + + } + + if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){ + offset = dist[indexOffset + j]; + } + } + + for(int j=0;j MIN_WEIGHT){ + newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j]; + norms[i] += newTau[j]; + } + else{ + newTau[j] = 0.0; + } + } + + for(int j=0;j MIN_TAU){ + + int oldTotal = total; + + total++; + + singleTau.resize(total, 0); + seqNumber.resize(total, 0); + seqIndex.resize(total, 0); + + singleTau[oldTotal] = newTau[j]; + + aaP[j][nSeqsPerOTU[j]] = oldTotal; + aaI[j][nSeqsPerOTU[j]] = i; + nSeqsPerOTU[j]++; + } + } + + } + + /***************************************************************************************************** + + if (pDataArray->m->control_pressed) { break; } + + iter++; + + pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n'); + + } + + if (pDataArray->m->control_pressed) { break; } + + pDataArray->m->mothurOut("\nFinalizing...\n"); + //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI); + /***************************************************************************************************** + int indexFill = 0; + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; + for(int j=0;jm->control_pressed) { break; } + + //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI); + /***************************************************************************************************** + vector bigTauMatrix(numOTUs * numSeqs, 0.0000); + + for(int i=0;im->control_pressed) { break; } + + for(int j=0;j maxTau){ + maxTau = bigTauMatrix[i * numOTUs + j]; + maxOTU = j; + } + } + + otuData[i] = maxOTU; + } + + nSeqsPerOTU.assign(numOTUs, 0); + + for(int i=0;im->control_pressed) { return 0; } + + cumNumSeqs[i] = indexFill; + for(int j=0;jm->control_pressed) { break; } + + vector otuCounts(numOTUs, 0); + for(int i=0;im->control_pressed) { break; } + + double count = 0; + int position = 0; + int minFlowGram = 100000000; + double minFlowValue = 1e8; + change[i] = 0; //FALSE + + for(int j=0;j 0 && count > MIN_COUNT){ + vector adF(nSeqsPerOTU[i]); + vector anL(nSeqsPerOTU[i]); + + for(int j=0;jsingleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]]; + flowAValue++; + flowBValue++; + } + + dist = dist / (double)lengths[nI]; + /***************************************************************************************************** + adF[k] += dist * tauValue; + } + } + + for(int j=0;jm->control_pressed) { break; } + + //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); + if (pDataArray->m->control_pressed) { break; } + /***************************************************************************************************** + string thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual"; + + ofstream qualityFile; + pDataArray->m->openOutputFile(qualityFileName, qualityFile); + + qualityFile.setf(ios::fixed, ios::floatfield); + qualityFile.setf(ios::showpoint); + qualityFile << setprecision(6); + + vector > qualities(numOTUs); + vector pr(HOMOPS, 0); + + + for(int i=0;im->control_pressed) { break; } + + int index = 0; + int base = 0; + + if(nSeqsPerOTU[i] > 0){ + qualities[i].assign(1024, -1); + + while(index < numFlowCells){ + double maxPrValue = 1e8; + short maxPrIndex = -1; + double count = 0.0000; + + pr.assign(HOMOPS, 0); + + for(int j=0;jsingleLookUp[s * NUMBINS + intensity]; + } + } + + maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index]; + maxPrValue = pr[maxPrIndex]; + + if(count > MIN_COUNT){ + double U = 0.0000; + double norm = 0.0000; + + for(int s=0;s0.00){ + temp = log10(U); + } + else{ + temp = -10.1; + } + temp = floor(-10 * temp); + value = (int)floor(temp); + if(value > 100){ value = 100; } + + qualities[i][base] = (int)value; + base++; + } + } + + index++; + } + } + + + if(otuCounts[i] > 0){ + qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl; + + int j=4; //need to get past the first four bases + while(qualities[i][j] != -1){ + qualityFile << qualities[i][j] << ' '; + j++; + } + qualityFile << endl; + } + } + qualityFile.close(); + pDataArray->outputNames.push_back(qualityFileName); + /***************************************************************************************************** + + // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids); + if (pDataArray->m->control_pressed) { break; } + /***************************************************************************************************** + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta"; + ofstream fastaFile; + pDataArray->m->openOutputFile(fastaFileName, fastaFile); + + vector names(numOTUs, ""); + + for(int i=0;im->control_pressed) { break; } + + int index = centroids[i]; + + if(otuCounts[i] > 0){ + fastaFile << '>' << seqNameVector[aaI[i][0]] << endl; + + string newSeq = ""; + + for(int j=0;jflowOrder[j % 4]; + for(int k=0;koutputNames.push_back(fastaFileName); + + if(pDataArray->thisCompositeFASTAFileName != ""){ + pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName); + } + + /***************************************************************************************************** + + //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); + if (pDataArray->m->control_pressed) { break; } + /***************************************************************************************************** + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names"; + ofstream nameFileOut; + pDataArray->m->openOutputFile(nameFileName, nameFileOut); + + for(int i=0;im->control_pressed) { break; } + + if(otuCounts[i] > 0){ + nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]]; + + for(int j=1;joutputNames.push_back(nameFileName); + + + if(pDataArray->thisCompositeNameFileName != ""){ + pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName); + } + /***************************************************************************************************** + + //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); + if (pDataArray->m->control_pressed) { break; } + /***************************************************************************************************** + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts"; + ofstream otuCountsFile; + pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile); + + string bases = pDataArray->flowOrder; + + for(int i=0;im->control_pressed) { + break; + } + //output the translated version of the centroid sequence for the otu + if(otuCounts[i] > 0){ + int index = centroids[i]; + + otuCountsFile << "ideal\t"; + for(int j=8;joutputNames.push_back(otuCountsFileName) + /***************************************************************************************************** + + //writeGroups(flowFileName, numSeqs, seqNameVector); + if (pDataArray->m->control_pressed) { break; } + /***************************************************************************************************** + thisOutputDir = pDataArray->outputDir; + if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); } + string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)); + string groupFileName = fileRoot + "shhh.groups"; + ofstream groupFile; + pDataArray->m->openOutputFile(groupFileName, groupFile); + + for(int i=0;im->control_pressed) { break; } + groupFile << seqNameVector[i] << '\t' << fileRoot << endl; + } + groupFile.close(); + pDataArray->outputNames.push_back(groupFileName); + /***************************************************************************************************** + + pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n'); + } + + if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; } + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction"); + exit(1); + } +} #endif - -}; +*/ +#endif -#endif \ No newline at end of file