+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
+ try {
+
+ int numCompleted = 0;
+
+ for(int i=start;i<end;i++){
+
+ if (m->control_pressed) { break; }
+
+ vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
+ if (large) { theseFlowFileNames = parseFlowFiles(filenames[i]); }
+
+ if (m->control_pressed) { break; }
+
+ double begClock = clock();
+ unsigned long long begTime;
+
+ for (int g = 0; g < theseFlowFileNames.size(); g++) {
+
+ string flowFileName = theseFlowFileNames[g];
+ m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
+ m->mothurOut("Reading flowgrams...\n");
+
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
+ int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Identifying unique flowgrams...\n");
+ int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Calculating distances between flowgrams...\n");
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ begTime = time(NULL);
+
+
+ flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+
+ m->mothurOutEndLine();
+ m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nClustering flowgrams...\n");
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ cluster(listFileName, distFileName, namesFileName);
+
+ if (m->control_pressed) { break; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
+
+ int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurRemove(distFileName);
+ m->mothurRemove(namesFileName);
+ m->mothurRemove(listFileName);
+
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
+
+ if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
+
+ if (m->control_pressed) { break; }
+
+ double maxDelta = 0;
+ int iter = 0;
+
+ begClock = clock();
+ begTime = time(NULL);
+
+ m->mothurOut("\nDenoising flowgrams...\n");
+ m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+
+ while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+
+ if (m->control_pressed) { break; }
+
+ double cycClock = clock();
+ unsigned long long cycTime = time(NULL);
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+
+ if (m->control_pressed) { break; }
+
+ double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+
+ if (m->control_pressed) { break; }
+
+ checkCentroids(numOTUs, centroids, weight);
+
+ if (m->control_pressed) { break; }
+
+ calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+
+ if (m->control_pressed) { break; }
+
+ iter++;
+
+ m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+
+ }
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nFinalizing...\n");
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ vector<int> otuCounts(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ if ((large) && (g == 0)) { flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir = m->hasPath(flowFileName); }
+ string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
+ string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
+ string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
+ string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("counts");
+ string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+ string groupFileName = fileRoot + getOutputFileNameTag("group");
+
+
+ writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+ writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+ writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
+ writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
+ writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector); if (m->control_pressed) { break; }
+
+ if (large) {
+ if (g > 0) {
+ m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("qfile")));
+ m->mothurRemove(qualityFileName);
+ m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("fasta")));
+ m->mothurRemove(fastaFileName);
+ m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("name")));
+ m->mothurRemove(nameFileName);
+ m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("counts")));
+ m->mothurRemove(otuCountsFileName);
+ m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("group")));
+ m->mothurRemove(groupFileName);
+ }
+ m->mothurRemove(theseFlowFileNames[g]);
+ }
+ }
+
+ numCompleted++;
+ m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
+ }
+
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ return numCompleted;
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "driver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){