5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "shhhercommand.h"
12 //**********************************************************************************************************************
// Registers every parameter that the shhh.flows command accepts in the member `parameters`
// and collects the registered names into myArray.
// NOTE(review): the try/return lines are not visible in this listing; presumably myArray is
// what gets returned to the caller -- confirm against the full file.
13 vector<string> ShhherCommand::setParameters(){
// Input files: "flow" and "file" share the "fileflow" tag; "lookup" has no such tag.
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
// Numeric settings.
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
// NOTE(review): "large" is registered here with the value "-1", while the option constructor
// falls back to "0" when the user omits it -- confirm which default is intended.
21 CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
// String settings: flow order and the input/output directories.
24 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
// Copy just the parameter names, in registration order.
28 vector<string> myArray;
29 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
// Error path: report the exception together with the class and method name.
33 m->errorOut(e, "ShhherCommand", "setParameters");
37 //**********************************************************************************************************************
// Builds the help text for the shhh.flows command by appending to helpString.
// NOTE(review): the return statement is not visible in this listing; presumably helpString is returned.
38 string ShhherCommand::getHelpString(){
40 string helpString = "";
41 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
// Error path: report the exception together with the class and method name.
45 m->errorOut(e, "ShhherCommand", "getHelpString");
49 //**********************************************************************************************************************
// Default constructor: marks the command as aborted with help requested, so that
// execute() returns immediately (it returns 0 when abort and calledHelp are both set).
51 ShhherCommand::ShhherCommand(){
53 abort = true; calledHelp = true;
// The output-type registration below is commented out in the original.
56 //initialize outputTypes
57 // vector<string> tempOutNames;
58 // outputTypes["pn.dist"] = tempOutNames;
// Error path: report the exception together with the class and method name.
62 m->errorOut(e, "ShhherCommand", "ShhherCommand");
67 //**********************************************************************************************************************
// Constructs the command from the user's option string.
// Steps visible in this listing:
//   1. record the MPI rank (pid) and process count (ncpus);
//   2. handle "help" / "citation" requests (both set abort and calledHelp);
//   3. validate the supplied parameter names against setParameters();
//   4. prefix inputdir onto flow/lookup/file values that carry no path;
//   5. require either "flow" or "file", and collect the flow files to process;
//   6. locate the lookup file (working dir, inputdir, default path, executable location);
//   7. read the numeric settings and the flow order, applying defaults when absent.
// Any failure sets abort = true so that execute() does no work.
// NOTE(review): several original lines (declarations, braces, preprocessor guards) are elided
// from this listing, so branch boundaries below are inferred where stated.
69 ShhherCommand::ShhherCommand(string option) {
73 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
78 abort = false; calledHelp = false;
80 //allow user to run help
81 if(option == "help") { help(); abort = true; calledHelp = true; }
82 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
85 vector<string> myArray = setParameters();
// This local map shadows the member `parameters` that setParameters() fills.
87 OptionParser parser(option);
88 map<string,string> parameters = parser.getParameters();
90 ValidParameters validParameter;
91 map<string,string>::iterator it;
93 //check to make sure all parameters are valid for command
94 for (it = parameters.begin(); it != parameters.end(); it++) {
95 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 // outputTypes["pn.dist"] = tempOutNames;
101 // outputTypes["fasta"] = tempOutNames;
103 //if the user changes the input directory command factory will send this info to us in the output parameter
104 string inputDir = validParameter.validFile(parameters, "inputdir", false);
105 if (inputDir == "not found"){ inputDir = ""; }
108 it = parameters.find("flow");
109 //user has given a flow file
110 if(it != parameters.end()){
111 path = m->hasPath(it->second);
112 //if the user has not given a path then, add inputdir. else leave path alone.
113 if (path == "") { parameters["flow"] = inputDir + it->second; }
116 it = parameters.find("lookup");
117 //user has given a lookup file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["lookup"] = inputDir + it->second; }
124 it = parameters.find("file");
125 //user has given a file listing flow files
126 if(it != parameters.end()){
127 path = m->hasPath(it->second);
128 //if the user has not given a path then, add inputdir. else leave path alone.
129 if (path == "") { parameters["file"] = inputDir + it->second; }
133 //if the user changes the output directory command factory will send this info to us in the output parameter
134 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
136 //check for required parameters
137 flowFileName = validParameter.validFile(parameters, "flow", true);
138 flowFilesFileName = validParameter.validFile(parameters, "file", true);
// Neither input was supplied: tell the user.
139 if (flowFileName == "not found" && flowFilesFileName == "not found") {
140 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141 m->mothurOutEndLine();
144 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
// A single flow file was given: no composite fasta/names outputs are used.
146 if(flowFileName != "not found"){
147 compositeFASTAFileName = "";
148 compositeNamesFileName = "";
// Presumably the else-branch (a "file" of flow files was given): create the composite
// fasta and names files next to the list file, or in its directory when it has a path.
153 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
155 //flow.files = 9 character offset
156 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
157 m->openOutputFile(compositeFASTAFileName, temp);
160 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
161 m->openOutputFile(compositeNamesFileName, temp);
// Read the list of flow files one name per line and keep the ones that can be opened.
165 if(flowFilesFileName != "not found"){
168 ifstream flowFilesFile;
169 m->openInputFile(flowFilesFileName, flowFilesFile);
170 while(flowFilesFile){
171 fName = m->getline(flowFilesFile);
173 //test if file is valid
// openInputFile is treated as having failed when it returns 1.
175 int ableToOpen = m->openInputFile(fName, in, "noerror");
// First fallback: the user's input directory.
177 if (ableToOpen == 1) {
178 if (inputDir != "") { //default path is set
179 string tryPath = inputDir + fName;
180 m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
182 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
// Second fallback: mothur's default path.
188 if (ableToOpen == 1) {
189 if (m->getDefaultPath() != "") { //default path is set
190 string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
191 m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
193 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
199 //if you can't open it, it's not in the current working directory or inputDir, so try mothur's executable location
200 if (ableToOpen == 1) {
// Lower-case a copy of the executable path and cut the real path at the last 'm' found;
// presumably this strips the trailing "mothur" executable name -- confirm.
201 string exepath = m->argv;
202 string tempPath = exepath;
203 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
204 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
206 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
207 m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
209 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
// Files that could not be opened anywhere are skipped; the rest are queued for processing.
214 if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
215 else { flowFileVector.push_back(fName); }
216 m->gobble(flowFilesFile);
218 flowFilesFile.close();
219 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
// Presumably the single-flow-file branch: default the output directory to the flow file's
// own directory and queue that one file.
222 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
223 flowFileVector.push_back(flowFileName);
226 //check for optional parameter and set defaults
227 // ...at some point should added some additional type checking...
// Lookup file: when not supplied, fall back to LookUp_Titanium.pat and search the same
// locations as above, remembering the path that was tried.
229 temp = validParameter.validFile(parameters, "lookup", true);
230 if (temp == "not found") {
231 lookupFileName = "LookUp_Titanium.pat";
235 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
238 //if you can't open it, try input location
239 if (ableToOpen == 1) {
240 if (inputDir != "") { //default path is set
241 string tryPath = inputDir + lookupFileName;
242 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
244 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
246 lookupFileName = tryPath;
250 //if you can't open it, try default location
251 if (ableToOpen == 1) {
252 if (m->getDefaultPath() != "") { //default path is set
253 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
254 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
256 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
258 lookupFileName = tryPath;
262 //if you can't open it, it's not in the current working directory or inputDir, so try mothur's executable location
263 if (ableToOpen == 1) {
264 string exepath = m->argv;
265 string tempPath = exepath;
266 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
267 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
269 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
270 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
272 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
274 lookupFileName = tryPath;
// Every location failed: abort the command.
277 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
// The user named a lookup file but it could not be opened: retry next to the executable.
279 else if(temp == "not open") {
281 lookupFileName = validParameter.validFile(parameters, "lookup", false);
283 //if you can't open it, it's not in inputDir, so try mothur's executable location
284 string exepath = m->argv;
285 string tempPath = exepath;
286 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
287 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
289 string tryPath = m->getFullPathName(exepath) + lookupFileName;
290 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
292 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
294 lookupFileName = tryPath;
296 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
297 }else { lookupFileName = temp; }
// Numeric settings; each falls back to the literal shown when the user gave no value.
299 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
300 m->setProcessors(temp);
301 m->mothurConvert(temp, processors);
303 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
304 m->mothurConvert(temp, cutoff);
306 temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
307 m->mothurConvert(temp, minDelta);
309 temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
310 m->mothurConvert(temp, maxIters);
// Any non-zero largeSize switches "large" mode on.
// NOTE(review): a negative value only prints a message in the visible lines; abort is not set
// and large stays true -- confirm whether that is intended.
312 temp = validParameter.validFile(parameters, "large", false); if (temp == "not found"){ temp = "0"; }
313 m->mothurConvert(temp, largeSize);
314 if (largeSize != 0) { large = true; }
315 else { large = false; }
316 if (largeSize < 0) { m->mothurOut("The value of the large cannot be negative.\n"); }
// Presumably compiled only in the MPI build (the guarding preprocessor line is not visible).
319 if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }
323 temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
324 m->mothurConvert(temp, sigma);
// Flow order defaults to "TACG" and must otherwise be exactly four characters long.
326 flowOrder = validParameter.validFile(parameters, "order", false);
327 if (flowOrder == "not found"){ flowOrder = "TACG"; }
328 else if(flowOrder.length() != 4){
329 m->mothurOut("The value of the order option must be four bases long\n");
// Error path: report the exception together with the class and method name.
337 catch(exception& e) {
338 m->errorOut(e, "ShhherCommand", "ShhherCommand");
342 //**********************************************************************************************************************
// Runs the shhh.flows pipeline in the MPI build.
// Rank 0 (the sends to ranks 1..ncpus-1 below) drives the work for each flow file:
// it distributes the flow data, computes its share of the pairwise distances, merges the
// workers' distance files, clusters, and then iterates the denoising loop while exchanging
// centroids, weights, tau values and distances with the workers. The second half of the
// function (the receives from rank 0) is the worker side, which mirrors every send.
// Returns 0 on help, abort broadcast or interruption, and 2 when abort is set without help.
// NOTE(review): several original lines (declarations, braces, the rank test and preprocessor
// guards) are elided from this listing; branch boundaries are inferred from the MPI calls.
344 int ShhherCommand::execute(){
346 if (abort == true) { if (calledHelp) { return 0; } return 2; }
// Tell every worker whether to abort.
// NOTE(review): abort is sent/received as one MPI_INT; confirm that its declared type is
// int-sized, otherwise the matching receive on the worker writes past the variable.
353 for(int i=1;i<ncpus;i++){
354 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
356 if(abort == 1){ return 0; }
360 m->mothurOut("\nGetting preliminary data...\n");
361 getSingleLookUp(); if (m->control_pressed) { return 0; }
362 getJointLookUp(); if (m->control_pressed) { return 0; }
// Build the list of flow files: every line of the "file" list, or the single flow file.
364 vector<string> flowFileVector;
365 if(flowFilesFileName != "not found"){
368 ifstream flowFilesFile;
369 m->openInputFile(flowFilesFileName, flowFilesFile);
370 while(flowFilesFile){
371 fName = m->getline(flowFilesFile);
372 flowFileVector.push_back(fName);
373 m->gobble(flowFilesFile);
377 flowFileVector.push_back(flowFileName);
380 int numFiles = flowFileVector.size();
// Workers need the file count to run the same number of outer iterations.
382 for(int i=1;i<ncpus;i++){
383 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
386 for(int i=0;i<numFiles;i++){
388 if (m->control_pressed) { break; }
390 double begClock = clock();
391 unsigned long long begTime = time(NULL);
393 flowFileName = flowFileVector[i];
395 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
396 m->mothurOut("Reading flowgrams...\n");
399 if (m->control_pressed) { break; }
401 m->mothurOut("Identifying unique flowgrams...\n");
404 if (m->control_pressed) { break; }
406 m->mothurOut("Calculating distances between flowgrams...\n");
// NOTE(review): fileName is sent as 1024 chars below; strcpy does not bound the copy, so a
// longer path would overflow the buffer -- confirm the buffer size and consider bounding.
408 strcpy(fileName, flowFileName.c_str());
// Ship everything a worker needs for the distance calculation.
410 for(int i=1;i<ncpus;i++){
411 MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
413 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
414 MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415 MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416 MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
417 MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
418 MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
419 MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
420 MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
421 MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
422 MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
// Rank 0 takes the first slice of uniques. Slice boundaries use sqrt(rank/ncpus); since
// flowDistMPI compares row i against all j < i, this presumably evens out the pair counts.
425 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
427 if (m->control_pressed) { break; }
// Wait for each worker to finish, then fold its temporary distance file into the main one.
430 for(int i=1;i<ncpus;i++){
431 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
433 m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
434 m->mothurRemove((distFileName + ".temp." + toString(i)));
437 string namesFileName = createNamesFile();
439 if (m->control_pressed) { break; }
441 m->mothurOut("\nClustering flowgrams...\n");
442 string listFileName = cluster(distFileName, namesFileName);
444 if (m->control_pressed) { break; }
// The intermediate dist/names/list files are deleted once the OTU data has been loaded.
448 getOTUData(listFileName);
450 m->mothurRemove(distFileName);
451 m->mothurRemove(namesFileName);
452 m->mothurRemove(listFileName);
454 if (m->control_pressed) { break; }
458 if (m->control_pressed) { break; }
// Inputs for the denoising stage.
461 for(int i=1;i<ncpus;i++){
462 MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
463 MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
464 MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
465 MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
468 if (m->control_pressed) { break; }
// Rank 0 handles the first numOTUs/ncpus OTUs and the first numSeqs/ncpus sequences.
473 int numOTUsOnCPU = numOTUs / ncpus;
474 int numSeqsOnCPU = numSeqs / ncpus;
475 m->mothurOut("\nDenoising flowgrams...\n");
476 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
// Iterate at least MIN_ITER times, then until maxDelta <= minDelta or maxIters is reached
// (maxIters == 0 means no iteration cap).
478 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
480 double cycClock = clock();
481 unsigned long long cycTime = time(NULL);
484 if (m->control_pressed) { break; }
// Per-iteration state for the workers' centroid calculation.
486 int total = singleTau.size();
487 for(int i=1;i<ncpus;i++){
488 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
489 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
490 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
492 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
493 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
494 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
495 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
496 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
499 calcCentroidsDriver(0, numOTUsOnCPU);
// Each worker returns full-length arrays; only its own OTU range is copied back.
501 for(int i=1;i<ncpus;i++){
502 int otuStart = i * numOTUs / ncpus;
503 int otuStop = (i + 1) * numOTUs / ncpus;
505 vector<int> tempCentroids(numOTUs, 0);
506 vector<short> tempChange(numOTUs, 0);
508 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
509 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
511 for(int j=otuStart;j<otuStop;j++){
512 centroids[j] = tempCentroids[j];
513 change[j] = tempChange[j];
517 maxDelta = getNewWeights(); if (m->control_pressed) { break; }
518 double nLL = getLikelihood(); if (m->control_pressed) { break; }
519 checkCentroids(); if (m->control_pressed) { break; }
// Publish the updated centroids, weights and change flags for the distance update.
521 for(int i=1;i<ncpus;i++){
522 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
523 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
524 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
527 calcNewDistancesParent(0, numSeqsOnCPU);
529 total = singleTau.size();
// Append each worker's tau entries after rank 0's and copy its slice of the dist matrix.
531 for(int i=1;i<ncpus;i++){
533 int seqStart = i * numSeqs / ncpus;
534 int seqStop = (i + 1) * numSeqs / ncpus;
536 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
538 vector<int> childSeqIndex(childTotal, 0);
539 vector<double> childSingleTau(childTotal, 0);
540 vector<double> childDist(numSeqs * numOTUs, 0);
541 vector<int> otuIndex(childTotal, 0);
543 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
545 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
546 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548 int oldTotal = total;
550 singleTau.resize(total, 0);
551 seqIndex.resize(total, 0);
552 seqNumber.resize(total, 0);
556 for(int j=oldTotal;j<total;j++){
557 int otuI = otuIndex[childIndex];
558 int seqI = childSeqIndex[childIndex];
560 singleTau[j] = childSingleTau[childIndex];
562 aaP[otuI][nSeqsPerOTU[otuI]] = j;
563 aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
568 int index = seqStart * numOTUs;
569 for(int j=seqStart;j<seqStop;j++){
570 for(int k=0;k<numOTUs;k++){
571 dist[index] = childDist[index];
579 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
// Same condition as the while loop: tell the workers whether another iteration follows.
581 if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
583 for(int i=1;i<ncpus;i++){
584 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
589 for(int i=1;i<ncpus;i++){
590 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
596 if (m->control_pressed) { break; }
598 m->mothurOut("\nFinalizing...\n");
601 if (m->control_pressed) { break; }
// Count the sequences assigned to each OTU, recompute all centroids, and write the outputs.
605 vector<int> otuCounts(numOTUs, 0);
606 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
607 calcCentroidsDriver(0, numOTUs);
609 if (m->control_pressed) { break; }
611 writeQualities(otuCounts); if (m->control_pressed) { break; }
612 writeSequences(otuCounts); if (m->control_pressed) { break; }
613 writeNames(otuCounts); if (m->control_pressed) { break; }
614 writeClusters(otuCounts); if (m->control_pressed) { break; }
615 writeGroups(); if (m->control_pressed) { break; }
618 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
// ---- Worker side: every receive below pairs with a send from rank 0 above. ----
624 MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
625 if(abort){ return 0; }
628 MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
630 for(int i=0;i<numFiles;i++){
632 if (m->control_pressed) { break; }
634 //Now into the pyrodist part
638 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
639 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
640 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
// Size the receive buffers before the bulk data arrives.
643 flowDataIntI.resize(numSeqs * numFlowCells);
644 flowDataPrI.resize(numSeqs * numFlowCells);
645 mapUniqueToSeq.resize(numSeqs);
646 mapSeqToUnique.resize(numSeqs);
647 lengths.resize(numSeqs);
648 jointLookUp.resize(NUMBINS * NUMBINS);
650 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
651 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
652 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
653 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
654 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
655 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
656 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
// This worker's slice of uniques, using the same sqrt-based boundaries as rank 0.
658 flowFileName = string(fileName);
659 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
660 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
662 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
664 if (m->control_pressed) { break; }
// Signal rank 0 that this worker's temporary distance file is ready.
667 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
669 //Now into the pyronoise part
670 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
672 singleLookUp.resize(HOMOPS * NUMBINS);
673 uniqueFlowgrams.resize(numUniques * numFlowCells);
674 weight.resize(numOTUs);
675 centroids.resize(numOTUs);
676 change.resize(numOTUs);
677 dist.assign(numOTUs * numSeqs, 0);
678 nSeqsPerOTU.resize(numOTUs);
679 cumNumSeqs.resize(numOTUs);
681 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
682 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
683 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
// OTU and sequence ranges owned by this worker.
685 int startOTU = pid * numOTUs / ncpus;
686 int endOTU = (pid + 1) * numOTUs / ncpus;
688 int startSeq = pid * numSeqs / ncpus;
689 int endSeq = (pid + 1) * numSeqs /ncpus;
695 if (m->control_pressed) { break; }
697 MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698 singleTau.assign(total, 0.0000);
699 seqNumber.assign(total, 0);
700 seqIndex.assign(total, 0);
702 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
703 MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704 MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
// seqNumber is sent by rank 0 as MPI_INT, so it must be received as MPI_INT.
705 MPI_Recv(&seqNumber[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
706 MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
// nSeqsPerOTU holds numOTUs entries (see the resize above) and rank 0 sends numOTUs values.
707 MPI_Recv(&nSeqsPerOTU[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
708 MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710 calcCentroidsDriver(startOTU, endOTU);
// Return the full arrays; rank 0 keeps only this worker's OTU range.
712 MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
713 MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
715 MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716 MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
717 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
// Recompute distances and tau for this worker's sequences, then send the results back.
719 vector<int> otuIndex(total, 0);
720 calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
721 total = otuIndex.size();
723 MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
724 MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
725 MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
726 MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
727 MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
// Rank 0 decides whether another iteration follows.
729 MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
// On interruption, remove every output file produced so far.
734 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
736 MPI_Barrier(MPI_COMM_WORLD);
// The composite files only exist when a list of flow files was given.
739 if(compositeFASTAFileName != ""){
740 outputNames.push_back(compositeFASTAFileName);
741 outputNames.push_back(compositeNamesFileName);
744 m->mothurOutEndLine();
745 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
746 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
747 m->mothurOutEndLine();
// Error path: report the exception together with the class and method name.
752 catch(exception& e) {
753 m->errorOut(e, "ShhherCommand", "execute");
757 /**************************************************************************************************/
// Writes a names file for the current flow file: one line per unique flowgram holding
// mapUniqueToSeq[i], a tab, and the comma-separated names of all sequences mapped to it.
// The file is named after flowFileName with its extension replaced by ".shhh.names".
// NOTE(review): the return statement is not visible in this listing; presumably nameFileName is returned.
758 string ShhherCommand::createNamesFile(){
// Group sequence names by the unique they map to; each name is followed by a comma.
761 vector<string> duplicateNames(numUniques, "");
762 for(int i=0;i<numSeqs;i++){
763 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
766 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
769 m->openOutputFile(nameFileName, nameFile);
771 for(int i=0;i<numUniques;i++){
773 if (m->control_pressed) { break; }
// The substr call drops the trailing comma added above.
775 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
776 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
// Error path: report the exception together with the class and method name.
782 catch(exception& e) {
783 m->errorOut(e, "ShhherCommand", "createNamesFile");
787 /**************************************************************************************************/
// Computes pairwise flowgram distances for the unique flowgrams i in [startSeq, stopSeq),
// each against every unique j < i, and writes the pairs whose distance is at most `cutoff`.
// Output goes to "<flow file root>.shhh.dist"; ranks other than 0 append ".temp.<pid>".
// Returns the name of the distance file (also when interrupted, before anything is written).
789 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
// Results are buffered in memory with fixed six-decimal formatting and written once at the end.
791 ostringstream outStream;
792 outStream.setf(ios::fixed, ios::floatfield);
793 outStream.setf(ios::dec, ios::basefield);
794 outStream.setf(ios::showpoint);
795 outStream.precision(6);
797 int begTime = time(NULL);
798 double begClock = clock();
800 for(int i=startSeq;i<stopSeq;i++){
802 if (m->control_pressed) { break; }
804 for(int j=0;j<i;j++){
805 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
// Distances below 1e-6 are written as exactly zero.
807 if(flowDistance < 1e-6){
808 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
810 else if(flowDistance <= cutoff){
811 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
// Progress line: row index, elapsed wall time, elapsed CPU time.
// NOTE(review): the condition guarding this output is not visible in this listing.
815 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
819 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
820 if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
822 if (m->control_pressed) { return fDistFileName; }
824 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
826 ofstream distFile(fDistFileName.c_str());
827 distFile << outStream.str();
830 return fDistFileName;
// Error path: report the exception together with the class and method name.
832 catch(exception& e) {
833 m->errorOut(e, "ShhherCommand", "flowDistMPI");
837 /**************************************************************************************************/
// Loads the OTU assignments from a list file: reads the label and the number of OTUs,
// resets the per-OTU containers, parses each OTU's sequence names character by character,
// maps them to indices through nameMap, and fills aaP, seqNumber, cumNumSeqs and seqIndex.
// NOTE(review): several lines of the parsing loop are elided from this listing.
839 void ShhherCommand::getOTUData(string listFileName){
843 m->openInputFile(listFileName, listFile);
846 listFile >> label >> numOTUs;
848 otuData.assign(numSeqs, 0);
849 cumNumSeqs.assign(numOTUs, 0);
850 nSeqsPerOTU.assign(numOTUs, 0);
851 aaP.clear();aaP.resize(numOTUs);
857 string singleOTU = "";
859 for(int i=0;i<numOTUs;i++){
861 if (m->control_pressed) { break; }
// Each OTU arrives as one whitespace-delimited token that is scanned one character at a time.
863 listFile >> singleOTU;
865 istringstream otuString(singleOTU);
871 for(int j=0;j<singleOTU.length();j++){
872 char letter = otuString.get();
// NOTE(review): in the visible lines the iterator returned by find() is dereferenced without
// comparing it to nameMap.end(); an unknown name would be undefined behavior -- confirm.
878 map<string,int>::iterator nmIt = nameMap.find(seqName);
879 int index = nmIt->second;
885 aaP[i].push_back(index);
// Presumably handles the final name in the token (after the scanning loop) -- confirm.
890 map<string,int>::iterator nmIt = nameMap.find(seqName);
892 int index = nmIt->second;
897 aaP[i].push_back(index);
// Sort the OTU's member indices and append them to the flat seqNumber list.
902 sort(aaP[i].begin(), aaP[i].end());
903 for(int j=0;j<nSeqsPerOTU[i];j++){
904 seqNumber.push_back(aaP[i][j]);
906 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
// cumNumSeqs[i] is the number of sequences in all OTUs before OTU i.
913 for(int i=1;i<numOTUs;i++){
914 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
917 seqIndex = seqNumber;
// Error path: report the exception together with the class and method name.
922 catch(exception& e) {
923 m->errorOut(e, "ShhherCommand", "getOTUData");
928 /**************************************************************************************************/
// Resets the denoising state for the current set of OTUs and computes the per-processor
// break points over sequences and OTUs.
930 void ShhherCommand::initPyroCluster(){
// With fewer OTUs than processors, fall back to a single processor.
932 if (numOTUs < processors) { processors = 1; }
// Initial state: zero distances, every OTU flagged as changed, no centroid (-1),
// zero weights, and a tau of 1.0 per sequence.
934 dist.assign(numSeqs * numOTUs, 0);
935 change.assign(numOTUs, 1);
936 centroids.assign(numOTUs, -1);
937 weight.assign(numOTUs, 0);
938 singleTau.assign(numSeqs, 1.0);
940 nSeqsBreaks.assign(processors+1, 0);
941 nOTUsBreaks.assign(processors+1, 0);
// Equal-sized chunks (integer part of count / processors) ...
944 for(int i=0;i<processors;i++){
945 nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
946 nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
// ... with the last break forced to the full count so the remainder lands in the final chunk.
948 nSeqsBreaks[processors] = numSeqs;
949 nOTUsBreaks[processors] = numOTUs;
// Error path: report the exception together with the class and method name.
951 catch(exception& e) {
952 m->errorOut(e, "ShhherCommand", "initPyroCluster");
957 /**************************************************************************************************/
// Flattens the per-OTU tables aaP and aaI into seqNumber and seqIndex, recording in
// cumNumSeqs[i] the position at which OTU i's entries start.
// NOTE(review): the declaration and increment of `index` are not visible in this listing.
959 void ShhherCommand::fill(){
962 for(int i=0;i<numOTUs;i++){
964 if (m->control_pressed) { break; }
966 cumNumSeqs[i] = index;
967 for(int j=0;j<nSeqsPerOTU[i];j++){
968 seqNumber[index] = aaP[i][j];
969 seqIndex[index] = aaI[i][j];
// Error path: report the exception together with the class and method name.
975 catch(exception& e) {
976 m->errorOut(e, "ShhherCommand", "fill");
981 /**************************************************************************************************/
// Reads the flow file named by flowFileName: the number of flow cells first, then for each
// sequence its name, its flow length and numFlowCells intensities. Intensities are capped at
// 9.99 and stored as integers in hundredths; entries past a sequence's own length are zeroed.
983 void ShhherCommand::getFlowData(){
986 m->openInputFile(flowFileName, flowFile);
989 seqNameVector.clear();
991 flowDataIntI.clear();
995 int currentNumFlowCells;
999 flowFile >> numFlowCells;
1000 int index = 0;//pcluster
// NOTE(review): the loop tests eof() before reading; it relies on the gobble call at the
// end of each record to consume trailing whitespace so that eof is reached.
1001 while(!flowFile.eof()){
1003 if (m->control_pressed) { break; }
1005 flowFile >> seqName >> currentNumFlowCells;
1006 lengths.push_back(currentNumFlowCells);
// Remember the name and its position so list-file names can be mapped back to indices.
1008 seqNameVector.push_back(seqName);
1009 nameMap[seqName] = index++;//pcluster
1011 for(int i=0;i<numFlowCells;i++){
1012 flowFile >> intensity;
1013 if(intensity > 9.99) { intensity = 9.99; }
// The small epsilon guards the truncation against values that land just below an integer.
1014 int intI = int(100 * intensity + 0.0001);
1015 flowDataIntI.push_back(intI);
1017 m->gobble(flowFile);
1021 numSeqs = seqNameVector.size();
// Zero every flow position at or beyond the sequence's recorded length.
1023 for(int i=0;i<numSeqs;i++){
1025 if (m->control_pressed) { break; }
1027 int iNumFlowCells = i * numFlowCells;
1028 for(int j=lengths[i];j<numFlowCells;j++){
1029 flowDataIntI[iNumFlowCells + j] = 0;
// Error path: report the exception together with the class and method name.
1034 catch(exception& e) {
1035 m->errorOut(e, "ShhherCommand", "getFlowData");
1039 /**************************************************************************************************/
// Worker-side update for the sequences in [startSeq, stopSeq): refreshes the distance from
// each sequence to every changed, sufficiently weighted OTU centroid, converts the distances
// into normalized tau values, and appends every (OTU, sequence, tau) triple whose tau exceeds
// MIN_TAU to otuIndex, seqIndex and singleTau.
// NOTE(review): lines between the declarations and the loop are elided; presumably they clear
// otuIndex, seqIndex and singleTau before the push_back calls below -- confirm.
1040 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1043 vector<double> newTau(numOTUs,0);
1044 vector<double> norms(numSeqs, 0);
1049 for(int i=startSeq;i<stopSeq;i++){
1051 if (m->control_pressed) { break; }
// offset tracks the smallest distance from sequence i to any OTU with weight > MIN_WEIGHT.
1053 double offset = 1e8;
1054 int indexOffset = i * numOTUs;
1056 for(int j=0;j<numOTUs;j++){
// Distances are recomputed only for OTUs flagged as changed.
1058 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1059 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1061 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1062 offset = dist[indexOffset + j];
// Subtracting the minimum distance keeps the exponent at or below zero before weighting.
1066 for(int j=0;j<numOTUs;j++){
1067 if(weight[j] > MIN_WEIGHT){
1068 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1069 norms[i] += newTau[j];
// Normalize so the taus of sequence i sum to one, then keep the significant ones.
1076 for(int j=0;j<numOTUs;j++){
1078 newTau[j] /= norms[i];
1080 if(newTau[j] > MIN_TAU){
1081 otuIndex.push_back(j);
1082 seqIndex.push_back(i);
1083 singleTau.push_back(newTau[j]);
// Error path: report the exception together with the class and method name.
1089 catch(exception& e) {
1090 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1095 /**************************************************************************************************/
// Same distance / tau update as calcNewDistancesChildMPI for sequences in
// [startSeq, stopSeq), but the results are written straight into the member tables:
// nSeqsPerOTU is reset to 0, and for every (OTU j, sequence i) pair whose normalised
// tau exceeds MIN_TAU the tau is stored in singleTau and the pair is recorded in
// aaP[j] / aaI[j]. singleTau, seqNumber and seqIndex are resized to `total` as pairs
// are added (the updates of `total` and nSeqsPerOTU[j] are outside the lines shown).
1097 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1102 vector<double> newTau(numOTUs,0);
1103 vector<double> norms(numSeqs, 0);
1104 nSeqsPerOTU.assign(numOTUs, 0);
1106 for(int i=startSeq;i<stopSeq;i++){
1108 if (m->control_pressed) { break; }
// dist is stored row-major: one row of numOTUs entries per sequence
1110 int indexOffset = i * numOTUs;
// minimum distance for this sequence; 1e8 acts as "none yet"
1112 double offset = 1e8;
1114 for(int j=0;j<numOTUs;j++){
// only recompute distances for OTUs whose centroid moved
1116 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1117 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1120 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1121 offset = dist[indexOffset + j];
1125 for(int j=0;j<numOTUs;j++){
1126 if(weight[j] > MIN_WEIGHT){
1127 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1128 norms[i] += newTau[j];
1135 for(int j=0;j<numOTUs;j++){
1136 newTau[j] /= norms[i];
1139 for(int j=0;j<numOTUs;j++){
1140 if(newTau[j] > MIN_TAU){
// oldTotal is the slot in singleTau that receives this pair's tau
1142 int oldTotal = total;
1146 singleTau.resize(total, 0);
1147 seqNumber.resize(total, 0);
1148 seqIndex.resize(total, 0);
1150 singleTau[oldTotal] = newTau[j];
// aaP holds the singleTau slot, aaI holds the sequence index, for OTU j
1152 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1153 aaI[j][nSeqsPerOTU[j]] = i;
1161 catch(exception& e) {
1162 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1167 /**************************************************************************************************/
// Assigns every sequence to a single OTU. The sparse tau values are first expanded
// into a dense numSeqs x numOTUs matrix; otuData[i] then receives maxOTU for each
// sequence (the assignment of maxOTU is outside the lines shown; presumably the column
// with the largest tau). Afterwards nSeqsPerOTU, singleTau, aaP and aaI are rebuilt so
// that each sequence belongs to exactly one OTU with tau 1.
1169 void ShhherCommand::setOTUs(){
// dense row-major matrix: row = sequence, column = OTU
1172 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1174 for(int i=0;i<numOTUs;i++){
1176 if (m->control_pressed) { break; }
1178 for(int j=0;j<nSeqsPerOTU[i];j++){
1179 int index = cumNumSeqs[i] + j;
1180 double tauValue = singleTau[seqNumber[index]];
1181 int sIndex = seqIndex[index];
1182 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1186 for(int i=0;i<numSeqs;i++){
// starts below any stored tau so the first column compared always wins
1187 double maxTau = -1.0000;
1189 for(int j=0;j<numOTUs;j++){
1190 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1191 maxTau = bigTauMatrix[i * numOTUs + j];
1196 otuData[i] = maxOTU;
// rebuild the membership tables from the hard assignment
1199 nSeqsPerOTU.assign(numOTUs, 0);
1201 for(int i=0;i<numSeqs;i++){
1202 int index = otuData[i];
1204 singleTau[i] = 1.0000;
1207 aaP[index][nSeqsPerOTU[index]] = i;
1208 aaI[index][nSeqsPerOTU[index]] = i;
1210 nSeqsPerOTU[index]++;
1214 catch(exception& e) {
1215 m->errorOut(e, "ShhherCommand", "setOTUs");
1220 /**************************************************************************************************/
// Collapses the flowgrams into unique flowgrams. Each sequence's intensities are
// rounded to whole homopolymer counts ((x + 50) / 100, truncated) and compared with
// the uniques found so far over the shorter of the two lengths. On a match the
// sequence is mapped to that unique (whose stored length is extended if needed);
// when index == numUniques a new unique is registered with this sequence as its
// representative. Finally the per-unique tables are trimmed to numUniques and
// flowDataPrI is filled with getProbIntensity() of every stored intensity.
1222 void ShhherCommand::getUniques(){
// size everything for the worst case of numSeqs uniques; -1 marks "unset"
1227 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1228 uniqueCount.assign(numSeqs, 0); // anWeights
1229 uniqueLengths.assign(numSeqs, 0);
1230 mapSeqToUnique.assign(numSeqs, -1);
1231 mapUniqueToSeq.assign(numSeqs, -1);
1233 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1235 for(int i=0;i<numSeqs;i++){
1237 if (m->control_pressed) { break; }
// rounded homopolymer calls for sequence i
1241 vector<short> current(numFlowCells);
1242 for(int j=0;j<numFlowCells;j++){
1243 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1246 for(int j=0;j<numUniques;j++){
1247 int offset = j * numFlowCells;
// compare only over the flows both flowgrams have
1251 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
1252 else { shorterLength = uniqueLengths[j]; }
1254 for(int k=0;k<shorterLength;k++){
1255 if(current[k] != uniqueFlowgrams[offset + k]){
1262 mapSeqToUnique[i] = j;
// keep the longest length seen for this unique
1265 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
// register a new unique flowgram
1271 if(index == numUniques){
1272 uniqueLengths[numUniques] = lengths[i];
1273 uniqueCount[numUniques] = 1;
1274 mapSeqToUnique[i] = numUniques;//anMap
1275 mapUniqueToSeq[numUniques] = i;//anF
1277 for(int k=0;k<numFlowCells;k++){
1278 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1279 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
// trim the worst-case allocations down to the uniques actually found
1285 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1286 uniqueLengths.resize(numUniques);
1288 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1289 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
1291 catch(exception& e) {
1292 m->errorOut(e, "ShhherCommand", "getUniques");
1297 /**************************************************************************************************/
// Distance between the flowgrams at offsets seqA and seqB of flowDataIntI /
// flowDataPrI: over the first minLength flows, sums the jointLookUp entry for the two
// intensities minus the two per-flow values from flowDataPrI, then divides by minLength.
// NOTE(review): minLength is read as lengths[mapSeqToUnique[seqA]], yet the comparison
// uses lengths[seqB] while the replacement uses lengths[mapSeqToUnique[seqB]]. The two
// index spaces (sequence vs. unique) are mixed here -- confirm which one is intended.
// NOTE(review): a minLength of 0 would make the final division 0/0 -- confirm callers.
1299 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1301 int minLength = lengths[mapSeqToUnique[seqA]];
1302 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
// start offsets of the two flowgrams in the flat per-flow arrays
1304 int ANumFlowCells = seqA * numFlowCells;
1305 int BNumFlowCells = seqB * numFlowCells;
1309 for(int i=0;i<minLength;i++){
1311 if (m->control_pressed) { break; }
1313 int flowAIntI = flowDataIntI[ANumFlowCells + i];
1314 float flowAPrI = flowDataPrI[ANumFlowCells + i];
1316 int flowBIntI = flowDataIntI[BNumFlowCells + i];
1317 float flowBPrI = flowDataPrI[BNumFlowCells + i];
// jointLookUp is indexed as a NUMBINS-wide table by the two intensities
1318 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
// average over the compared flows
1321 dist /= (float) minLength;
1324 catch(exception& e) {
1325 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1330 //**********************************************************************************************************************/
// Clusters the distances in distFileName (read through ReadColumnMatrix with the
// member cutoff) using the names in namesFileName, by running CompleteLinkage updates
// while the matrix's smallest distance is <= cutoff and nodes remain. The resulting
// list is labelled with the cutoff and printed to "<flowFileName root>.shhh.list";
// that file name is returned.
1332 string ShhherCommand::cluster(string distFileName, string namesFileName){
1335 ReadMatrix* read = new ReadColumnMatrix(distFileName);
1336 read->setCutoff(cutoff);
1338 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1339 clusterNameMap->readMap();
1340 read->read(clusterNameMap);
// list and matrix are taken from the reader and deleted at the end of this function
1342 ListVector* list = read->getListVector();
1343 SparseMatrix* matrix = read->getMatrix();
1346 delete clusterNameMap;
1348 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1350 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
1351 string tag = cluster->getTag();
1353 double clusterCutoff = cutoff;
// keep merging until the closest remaining pair is farther apart than the cutoff
1354 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1356 if (m->control_pressed) { break; }
1358 cluster->update(clusterCutoff);
1361 list->setLabel(toString(cutoff));
// output name: flowFileName with its last extension replaced by ".shhh.list"
1363 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1365 m->openOutputFile(listFileName, listFile);
1366 list->print(listFile);
1369 delete matrix; delete cluster; delete rabund; delete list;
1371 return listFileName;
1373 catch(exception& e) {
1374 m->errorOut(e, "ShhherCommand", "cluster");
1379 /**************************************************************************************************/
// Re-selects the centroid for each OTU in [start, finish). change[i] is cleared first.
// If the OTU has members and its summed tau (count) exceeds MIN_COUNT, the distinct
// unique-flowgram indices of its members are gathered in anL, each candidate's
// tau-weighted distance to all members is accumulated in adF, and the candidate with
// the smallest total is chosen. centroids[i] is updated when the choice differs from
// the current one. The handling of the else-branch (centroids[i] != -1) is outside
// the lines shown.
1381 void ShhherCommand::calcCentroidsDriver(int start, int finish){
1383 //this function gets the most likely homopolymer length at a flow position for a group of sequences
1388 for(int i=start;i<finish;i++){
1390 if (m->control_pressed) { break; }
// sentinels: replaced by the first candidate examined below
1394 int minFlowGram = 100000000;
1395 double minFlowValue = 1e8;
1396 change[i] = 0; //FALSE
// total tau mass currently assigned to this OTU
1398 for(int j=0;j<nSeqsPerOTU[i];j++){
1399 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1402 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
// adF[k]: accumulated weighted distance for candidate k; anL[k]: its unique-flowgram index
1403 vector<double> adF(nSeqsPerOTU[i]);
1404 vector<int> anL(nSeqsPerOTU[i]);
1406 for(int j=0;j<nSeqsPerOTU[i];j++){
1407 int index = cumNumSeqs[i] + j;
1408 int nI = seqIndex[index];
1409 int nIU = mapSeqToUnique[nI];
// scan the candidates collected so far (presumably to skip duplicates of nIU)
1412 for(k=0;k<position;k++){
1418 anL[position] = nIU;
1419 adF[position] = 0.0000;
1424 for(int j=0;j<nSeqsPerOTU[i];j++){
1425 int index = cumNumSeqs[i] + j;
1426 int nI = seqIndex[index];
1428 double tauValue = singleTau[seqNumber[index]];
1430 for(int k=0;k<position;k++){
1431 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1432 adF[k] += dist * tauValue;
// pick the candidate with the smallest accumulated distance
1436 for(int j=0;j<position;j++){
1437 if(adF[j] < minFlowValue){
1439 minFlowValue = adF[j];
1443 if(centroids[i] != anL[minFlowGram]){
1445 centroids[i] = anL[minFlowGram];
1448 else if(centroids[i] != -1){
1454 catch(exception& e) {
1455 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1460 /**************************************************************************************************/
// Mean per-flow cost between unique flowgram `cent` and sequence `flow` over `length`
// flows: each step adds the singleLookUp entry selected by the centroid's homopolymer
// call and the sequence's stored intensity, and the sum is divided by length.
// The advancing of flowAValue / flowBValue between iterations is outside the lines shown.
// NOTE(review): length == 0 makes the return expression divide by zero -- confirm callers.
1462 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
// start offsets into uniqueFlowgrams and flowDataIntI respectively
1465 int flowAValue = cent * numFlowCells;
1466 int flowBValue = flow * numFlowCells;
1470 for(int i=0;i<length;i++){
1471 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1476 return dist / (double)length;
1478 catch(exception& e) {
1479 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1484 /**************************************************************************************************/
// Recomputes weight[i] for every OTU by adding the tau of each of its members, and
// tracks the largest absolute change between the previous and the new weight in
// maxChange (presumably the return value; the return statement and the reset of
// weight[i] before accumulation are outside the lines shown).
1486 double ShhherCommand::getNewWeights(){
1489 double maxChange = 0;
1491 for(int i=0;i<numOTUs;i++){
1493 if (m->control_pressed) { break; }
// holds the previous weight until the new one is known
1495 double difference = weight[i];
1498 for(int j=0;j<nSeqsPerOTU[i];j++){
1499 int index = cumNumSeqs[i] + j;
1500 double tauValue = singleTau[seqNumber[index]];
1501 weight[i] += tauValue;
1504 difference = fabs(weight[i] - difference);
1505 if(difference > maxChange){ maxChange = difference; }
1509 catch(exception& e) {
1510 m->errorOut(e, "ShhherCommand", "getNewWeights");
1515 /**************************************************************************************************/
// Computes the negative log-likelihood term nLL for the current model. For every
// (OTU i, member sequence nI) pair, weight[i] * exp(-dist * sigma) is added to P[nI];
// entries of P that are still 0 are replaced by DBL_EPSILON before they are used, and
// numSeqs * log(sigma) is subtracted from nLL (nLL is presumably the return value; the
// return statement is outside the lines shown).
// Fix: the catch handler previously reported this function as "getNewWeights".
1517 double ShhherCommand::getLikelihood(){
// per-sequence mixture probability, accumulated across OTUs
1521 vector<long double> P(numSeqs, 0);
// first pass visits the OTUs whose weight exceeds MIN_WEIGHT
1524 for(int i=0;i<numOTUs;i++){
1525 if(weight[i] > MIN_WEIGHT){
1531 for(int i=0;i<numOTUs;i++){
1533 if (m->control_pressed) { break; }
1535 for(int j=0;j<nSeqsPerOTU[i];j++){
1536 int index = cumNumSeqs[i] + j;
1537 int nI = seqIndex[index];
1538 double singleDist = dist[seqNumber[index]];
1540 P[nI] += weight[i] * exp(-singleDist * sigma);
1544 for(int i=0;i<numSeqs;i++){
// avoid taking the log of zero
1545 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1550 nLL = nLL -(double)numSeqs * log(sigma);
1554 catch(exception& e) {
1555 m->errorOut(e, "ShhherCommand", "getLikelihood");
1560 /**************************************************************************************************/
// Merges OTUs that share a centroid. A per-OTU flag vector `unique` starts at 1; OTUs
// with no centroid (-1) or a weight below MIN_WEIGHT are handled in the first loop
// (the body is outside the lines shown). In the second loop, whenever a later OTU j
// has the same centroid as OTU i, weight[j] is added to weight[i].
1562 void ShhherCommand::checkCentroids(){
1564 vector<int> unique(numOTUs, 1);
1566 for(int i=0;i<numOTUs;i++){
1567 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1572 for(int i=0;i<numOTUs;i++){
1574 if (m->control_pressed) { break; }
// compare against every later OTU only, so each pair is visited once
1577 for(int j=i+1;j<numOTUs;j++){
1580 if(centroids[j] == centroids[i]){
// fold the duplicate's weight into the earlier OTU
1584 weight[i] += weight[j];
1592 catch(exception& e) {
1593 m->errorOut(e, "ShhherCommand", "checkCentroids");
1597 /**************************************************************************************************/
// Writes per-base quality scores to "<root of flowFileName>shhh.qual" in outputDir (or
// in the flow file's own directory when outputDir is empty) and records the file name
// in outputNames. For every OTU with members and every flow position, pr[s]
// accumulates tau * singleLookUp[s * NUMBINS + intensity] over the members for each
// homopolymer length s. The centroid's call at that flow selects maxPrIndex, and when
// count > MIN_COUNT a score capped at 100 is stored in qualities[i] for each base
// 1..maxPrIndex. OTUs with otuCounts[i] > 0 are then printed as a '>' header line
// followed by the scores from position 4 up to the first -1.
// NOTE(review): the print loop relies on a -1 sentinel inside qualities[i], which is
// only sized (1024 entries) when nSeqsPerOTU[i] > 0 -- confirm that otuCounts[i] > 0
// implies nSeqsPerOTU[i] > 0 and that fewer than 1024 scores are ever stored.
// NOTE(review): the header uses seqNameVector[mapUniqueToSeq[i]] with i an OTU index,
// whereas writeSequences names the OTU by aaI[i][0] -- confirm these are meant to agree.
1601 void ShhherCommand::writeQualities(vector<int> otuCounts){
1604 string thisOutputDir = outputDir;
1605 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1606 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1608 ofstream qualityFile;
1609 m->openOutputFile(qualityFileName, qualityFile);
// fixed-point output with six decimals
1611 qualityFile.setf(ios::fixed, ios::floatfield);
1612 qualityFile.setf(ios::showpoint);
1613 qualityFile << setprecision(6);
1615 vector<vector<int> > qualities(numOTUs);
1616 vector<double> pr(HOMOPS, 0);
1619 for(int i=0;i<numOTUs;i++){
1621 if (m->control_pressed) { break; }
1626 if(nSeqsPerOTU[i] > 0){
// -1 marks unused slots and terminates the print loop below
1627 qualities[i].assign(1024, -1);
1629 while(index < numFlowCells){
1630 double maxPrValue = 1e8;
1631 short maxPrIndex = -1;
1632 double count = 0.0000;
1634 pr.assign(HOMOPS, 0);
1636 for(int j=0;j<nSeqsPerOTU[i];j++){
1637 int lIndex = cumNumSeqs[i] + j;
1638 double tauValue = singleTau[seqNumber[lIndex]];
1639 int sequenceIndex = aaI[i][j];
1640 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1644 for(int s=0;s<HOMOPS;s++){
1645 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
// the centroid's homopolymer call at this flow position
1649 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1650 maxPrValue = pr[maxPrIndex];
1652 if(count > MIN_COUNT){
1654 double norm = 0.0000;
1656 for(int s=0;s<HOMOPS;s++){
1657 norm += exp(-(pr[s] - maxPrValue));
// one score per base of the called homopolymer
1660 for(int s=1;s<=maxPrIndex;s++){
1662 double temp = 0.0000;
1664 U += exp(-(pr[s-1]-maxPrValue))/norm;
1672 temp = floor(-10 * temp);
1673 value = (int)floor(temp);
1674 if(value > 100){ value = 100; }
1676 qualities[i][base] = (int)value;
1686 if(otuCounts[i] > 0){
1687 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1689 int j=4; //need to get past the first four bases
1690 while(qualities[i][j] != -1){
1691 qualityFile << qualities[i][j] << ' ';
1694 qualityFile << endl;
1697 qualityFile.close();
1698 outputNames.push_back(qualityFileName);
1701 catch(exception& e) {
1702 m->errorOut(e, "ShhherCommand", "writeQualities");
1707 /**************************************************************************************************/
// Writes one FASTA record per OTU with otuCounts[i] > 0 to
// "<root of flowFileName>shhh.fasta". The header is the name of the OTU's first member
// (aaI[i][0]); the sequence is built from the centroid's unique flowgram by repeating
// flowOrder[j % 4] as many times as the homopolymer call at flow j, and is written
// without its first four characters. The file name is added to outputNames and, when
// compositeFASTAFileName is set, the file is appended to it.
// NOTE(review): newSeq.substr(4) throws std::out_of_range if the built sequence is
// shorter than four characters -- confirm that cannot occur.
1709 void ShhherCommand::writeSequences(vector<int> otuCounts){
1711 string thisOutputDir = outputDir;
1712 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1713 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1715 m->openOutputFile(fastaFileName, fastaFile);
1717 vector<string> names(numOTUs, "");
1719 for(int i=0;i<numOTUs;i++){
1721 if (m->control_pressed) { break; }
// unique-flowgram index of this OTU's centroid
1723 int index = centroids[i];
1725 if(otuCounts[i] > 0){
1726 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1730 for(int j=0;j<numFlowCells;j++){
// the flow order is cycled with period 4
1732 char base = flowOrder[j % 4];
1733 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1738 fastaFile << newSeq.substr(4) << endl;
1743 outputNames.push_back(fastaFileName);
1745 if(compositeFASTAFileName != ""){
1746 m->appendFiles(fastaFileName, compositeFASTAFileName);
1749 catch(exception& e) {
1750 m->errorOut(e, "ShhherCommand", "writeSequences");
1755 /**************************************************************************************************/
// Writes "<root of flowFileName>shhh.names": for each OTU with otuCounts[i] > 0, the
// name of its first member, a tab, then the names of all members separated by commas
// (the first member is listed again as the first entry). The file name is added to
// outputNames and, when compositeNamesFileName is set, the file is appended to it.
1757 void ShhherCommand::writeNames(vector<int> otuCounts){
1759 string thisOutputDir = outputDir;
1760 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1761 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1763 m->openOutputFile(nameFileName, nameFile);
1765 for(int i=0;i<numOTUs;i++){
1767 if (m->control_pressed) { break; }
1769 if(otuCounts[i] > 0){
// representative name, then the member list starting with the representative itself
1770 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1772 for(int j=1;j<nSeqsPerOTU[i];j++){
1773 nameFile << ',' << seqNameVector[aaI[i][j]];
1780 outputNames.push_back(nameFileName);
1783 if(compositeNamesFileName != ""){
1784 m->appendFiles(nameFileName, compositeNamesFileName);
1787 catch(exception& e) {
1788 m->errorOut(e, "ShhherCommand", "writeNames");
1793 /**************************************************************************************************/
// Writes "<fileRoot>shhh.groups" with one line per sequence: the sequence name, a tab,
// and fileRoot (output directory plus the root name of flowFileName) as the group
// label. The file name is added to outputNames.
1795 void ShhherCommand::writeGroups(){
1797 string thisOutputDir = outputDir;
1798 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1799 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1800 string groupFileName = fileRoot + "shhh.groups";
1802 m->openOutputFile(groupFileName, groupFile);
1804 for(int i=0;i<numSeqs;i++){
1805 if (m->control_pressed) { break; }
// note: the group column carries the full fileRoot, directory included
1806 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1809 outputNames.push_back(groupFileName);
1812 catch(exception& e) {
1813 m->errorOut(e, "ShhherCommand", "writeGroups");
1818 /**************************************************************************************************/
// Writes "<root of flowFileName>shhh.counts". For each OTU with otuCounts[i] > 0 it
// prints an "ideal" line translating the centroid's unique flowgram from flow 8 onward
// into bases (flowOrder cycled with period 4), then one line per member sequence: its
// name, a tab, and its own translation (intensity rounded to a homopolymer count via
// int(0.01 * x + 0.5)) with the first four characters removed. A blank line follows
// each OTU. The file name is added to outputNames.
// NOTE(review): newSeq.substr(4) throws std::out_of_range for a translation shorter
// than four characters -- confirm that cannot occur.
1820 void ShhherCommand::writeClusters(vector<int> otuCounts){
1822 string thisOutputDir = outputDir;
1823 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1824 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1825 ofstream otuCountsFile;
1826 m->openOutputFile(otuCountsFileName, otuCountsFile);
1828 string bases = flowOrder;
1830 for(int i=0;i<numOTUs;i++){
1832 if (m->control_pressed) {
1835 //output the translated version of the centroid sequence for the otu
1836 if(otuCounts[i] > 0){
1837 int index = centroids[i];
1839 otuCountsFile << "ideal\t";
// the ideal sequence starts at flow 8
1840 for(int j=8;j<numFlowCells;j++){
1841 char base = bases[j % 4];
1842 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1843 otuCountsFile << base;
1846 otuCountsFile << endl;
// one line per member sequence of this OTU
1848 for(int j=0;j<nSeqsPerOTU[i];j++){
1849 int sequence = aaI[i][j];
1850 otuCountsFile << seqNameVector[sequence] << '\t';
1854 for(int k=0;k<lengths[sequence];k++){
1855 char base = bases[k % 4];
// round the stored hundredths back to a whole homopolymer count
1856 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1858 for(int s=0;s<freq;s++){
1860 //otuCountsFile << base;
1863 otuCountsFile << newSeq.substr(4) << endl;
1865 otuCountsFile << endl;
1868 otuCountsFile.close();
1869 outputNames.push_back(otuCountsFileName);
1872 catch(exception& e) {
1873 m->errorOut(e, "ShhherCommand", "writeClusters");
1879 //**********************************************************************************************************************
// Command entry point. Returns 0 immediately when abort is set or when the user
// interrupts after either lookup table is loaded. Otherwise the number of processors
// is limited to the number of flow files and the files are processed: on the Unix-like
// platforms named in the #if, directly through driver() when one processor is used and
// through createProcesses() otherwise; the remaining driver() call covers the other
// build. Composite output names, if any, are added to outputNames and all output file
// names are printed.
// NOTE(review): when flowFileVector is empty, processors becomes 0 and the Unix branch
// calls createProcesses() -- confirm an empty file list cannot reach this point.
1881 int ShhherCommand::execute(){
1883 if (abort == true) { return 0; }
1885 getSingleLookUp(); if (m->control_pressed) { return 0; }
1886 getJointLookUp(); if (m->control_pressed) { return 0; }
1888 int numFiles = flowFileVector.size();
// never use more processors than there are files
1890 if (numFiles < processors) { processors = numFiles; }
1892 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1893 if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1894 else { createProcesses(flowFileVector); } //each processor processes one file
1896 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1899 if(compositeFASTAFileName != ""){
1900 outputNames.push_back(compositeFASTAFileName);
1901 outputNames.push_back(compositeNamesFileName);
1904 m->mothurOutEndLine();
1905 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1906 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
1907 m->mothurOutEndLine();
1911 catch(exception& e) {
1912 m->errorOut(e, "ShhherCommand", "execute");
1917 /**************************************************************************************************/
// Splits `filenames` into one contiguous [start, end) range per processor and runs
// driver() on each range. On the Unix-like platforms named in the #if, child processes
// write to composite files suffixed with their pid + ".temp", and the parent processes
// lines[0] itself and then waits for the children. The other branch creates
// processors-1 threads through CreateThread, runs the last range on the calling thread
// and collects each thread's outputNames. Finally the per-worker ".temp" composite
// files are appended to the composite files and removed.
// Fix: processors is clamped to at least 1 before it is used as a divisor; previously
// an empty `filenames` (or a non-positive processors value) made
// filenames.size() / processors a division by zero.
// NOTE(review): the thread branch declares arrays sized by processors-1, which is a
// variable-length array and not standard C++; the branch is already marked as not
// working in the comments below.
1919 int ShhherCommand::createProcesses(vector<string> filenames){
1921 vector<int> processIDS;
// never use more processors than there are files
1926 if (filenames.size() < processors) { processors = filenames.size(); }
// guard the division below: an empty file list would otherwise leave processors at 0
if (processors < 1) { processors = 1; }
1928 //divide the groups between the processors
1929 vector<linePair> lines;
1930 int numFilesPerProcessor = filenames.size() / processors;
1931 for (int i = 0; i < processors; i++) {
1932 int startIndex = i * numFilesPerProcessor;
1933 int endIndex = (i+1) * numFilesPerProcessor;
// the last range absorbs the remainder of the division
1934 if(i == (processors - 1)){ endIndex = filenames.size(); }
1935 lines.push_back(linePair(startIndex, endIndex));
1938 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1940 //loop through and create all the processes you want
1941 while (process != processors) {
1945 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1947 }else if (pid == 0){
// child: write to pid-suffixed temp composites so the parent can merge them afterwards
1948 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1951 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1952 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
// parent handles the first range itself
1958 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1960 //force parent to wait until all the processes are done
1961 for (int i=0;i<processIDS.size();i++) {
1962 int temp = processIDS[i];
1968 //////////////////////////////////////////////////////////////////////////////////////////////////////
1970 /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1972 //////////////////////////////////////////////////////////////////////////////////////////////////////
1973 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
1974 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1975 //////////////////////////////////////////////////////////////////////////////////////////////////////
1977 vector<shhhFlowsData*> pDataArray;
1978 DWORD dwThreadIdArray[processors-1];
1979 HANDLE hThreadArray[processors-1];
1981 //Create processor worker threads.
1982 for( int i=0; i<processors-1; i++ ){
1983 // Allocate memory for thread data.
1984 string extension = "";
1985 if (i != 0) { extension = toString(i) + ".temp"; }
1987 shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1988 pDataArray.push_back(tempFlow);
1989 processIDS.push_back(i);
1991 hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1994 //using the main process as a worker saves time and memory
1996 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1998 //Wait until all threads have terminated.
1999 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2001 //Close all thread handles and free memory allocations.
2002 for(int i=0; i < pDataArray.size(); i++){
2003 for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2004 CloseHandle(hThreadArray[i]);
2005 delete pDataArray[i];
// merge each worker's temp composites into the real composite files, then delete them
2010 for (int i=0;i<processIDS.size();i++) {
2011 if (compositeFASTAFileName != "") {
2012 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2013 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2014 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2015 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2022 catch(exception& e) {
2023 m->errorOut(e, "ShhherCommand", "createProcesses");
2027 /**************************************************************************************************/
// Splits the flow file `filename` into chunk files named filename + <count> + ".temp".
// Every chunk starts with the flow-count header read from the original file, followed
// by up to largeSize lines copied from it. A chunk that received no lines is removed
// again. If the user interrupts, all chunks written so far are deleted and the list
// is cleared. A summary of the split is printed, and `files` holds the chunk names
// (presumably the return value; the return statement is outside the lines shown).
2029 vector<string> ShhherCommand::parseFlowFiles(string filename){
2031 vector<string> files;
2035 m->openInputFile(filename, in);
// header of the original file: number of flows per record
2037 int thisNumFLows = 0;
2038 in >> thisNumFLows; m->gobble(in);
2041 if (m->control_pressed) { break; }
2044 string outputFileName = filename + toString(count) + ".temp";
2045 m->openOutputFile(outputFileName, out);
// repeat the header so each chunk is a self-contained flow file
2046 out << thisNumFLows << endl;
2047 files.push_back(outputFileName);
2049 int numLinesWrote = 0;
2050 for (int i = 0; i < largeSize; i++) {
2051 if (in.eof()) { break; }
2052 string line = m->getline(in); m->gobble(in);
2053 out << line << endl;
// drop a trailing chunk that contains only the header
2058 if (numLinesWrote == 0) { m->mothurRemove(outputFileName); files.pop_back(); }
2063 if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); } files.clear(); }
2065 m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n");
2069 catch(exception& e) {
2070 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2074 /**************************************************************************************************/
// Denoises the flow files filenames[start..end). When `large` is set, each file is
// first split by parseFlowFiles() and every piece is processed in turn. Per piece the
// steps are: read flowgrams, collapse them to unique flowgrams, write pairwise
// distances and a names file, cluster, load the OTU assignment, delete those
// intermediate files, then iterate fill -> calcCentroidsDriver -> getNewWeights ->
// getLikelihood -> checkCentroids -> calcNewDistances until the stopping rule on
// maxDelta / iter is met. The qual, fasta, names, counts and groups outputs are then
// written. thisCompositeFASTAFileName and thisCompositeNamesFileName are passed on to
// writeSequences() and writeNames(). If the user interrupts, the files listed in
// outputNames are removed and 0 is returned.
// All working data is held in locals and passed explicitly to the helpers.
2076 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2079 for(int i=start;i<end;i++){
2081 if (m->control_pressed) { break; }
// by default the file is processed as a single piece
2083 vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2084 if (large) { theseFlowFileNames = parseFlowFiles(filenames[i]); }
2086 if (m->control_pressed) { break; }
2088 double begClock = clock();
2089 unsigned long long begTime;
2091 for (int g = 0; g < theseFlowFileNames.size(); g++) {
2093 string flowFileName = theseFlowFileNames[g];
2094 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2095 m->mothurOut("Reading flowgrams...\n");
2097 vector<string> seqNameVector;
2098 vector<int> lengths;
2099 vector<short> flowDataIntI;
2100 vector<double> flowDataPrI;
2101 map<string, int> nameMap;
2102 vector<short> uniqueFlowgrams;
2103 vector<int> uniqueCount;
2104 vector<int> mapSeqToUnique;
2105 vector<int> mapUniqueToSeq;
2106 vector<int> uniqueLengths;
2109 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2111 if (m->control_pressed) { break; }
2113 m->mothurOut("Identifying unique flowgrams...\n");
2114 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2116 if (m->control_pressed) { break; }
2118 m->mothurOut("Calculating distances between flowgrams...\n");
// intermediate files share the flow file's name with its last extension replaced
2119 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2120 begTime = time(NULL);
2123 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2125 m->mothurOutEndLine();
2126 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2129 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2130 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2132 if (m->control_pressed) { break; }
2134 m->mothurOut("\nClustering flowgrams...\n");
2135 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2136 cluster(listFileName, distFileName, namesFileName);
2138 if (m->control_pressed) { break; }
2140 vector<int> otuData;
2141 vector<int> cumNumSeqs;
2142 vector<int> nSeqsPerOTU;
2143 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2144 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2145 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
2146 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
2149 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2151 if (m->control_pressed) { break; }
// the clustering inputs and result are no longer needed once the OTU data is loaded
2153 m->mothurRemove(distFileName);
2154 m->mothurRemove(namesFileName);
2155 m->mothurRemove(listFileName);
2157 vector<double> dist; //adDist - distance of sequences to centroids
2158 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
2159 vector<int> centroids; //the representative flowgram for each cluster m
2160 vector<double> weight;
2161 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2162 vector<int> nSeqsBreaks;
2163 vector<int> nOTUsBreaks;
// initial state: every centroid unset and flagged as changed, every tau equal to 1
2165 dist.assign(numSeqs * numOTUs, 0);
2166 change.assign(numOTUs, 1);
2167 centroids.assign(numOTUs, -1);
2168 weight.assign(numOTUs, 0);
2169 singleTau.assign(numSeqs, 1.0);
2171 nSeqsBreaks.assign(2, 0);
2172 nOTUsBreaks.assign(2, 0);
2175 nSeqsBreaks[1] = numSeqs;
2176 nOTUsBreaks[1] = numOTUs;
2178 if (m->control_pressed) { break; }
2180 double maxDelta = 0;
2184 begTime = time(NULL);
2186 m->mothurOut("\nDenoising flowgrams...\n");
2187 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
// run at least MIN_ITER iterations; then continue while maxDelta exceeds minDelta,
// bounded by maxIters unless maxIters is 0 (no bound)
2189 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2191 if (m->control_pressed) { break; }
2193 double cycClock = clock();
2194 unsigned long long cycTime = time(NULL);
2195 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2197 if (m->control_pressed) { break; }
2199 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2201 if (m->control_pressed) { break; }
2203 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
2205 if (m->control_pressed) { break; }
2207 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
2209 if (m->control_pressed) { break; }
2211 checkCentroids(numOTUs, centroids, weight);
2213 if (m->control_pressed) { break; }
2215 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2217 if (m->control_pressed) { break; }
2221 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2225 if (m->control_pressed) { break; }
2227 m->mothurOut("\nFinalizing...\n");
2228 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2230 if (m->control_pressed) { break; }
2232 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2234 if (m->control_pressed) { break; }
// number of sequences finally assigned to each OTU
2236 vector<int> otuCounts(numOTUs, 0);
2237 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
2239 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2241 if (m->control_pressed) { break; }
// for a split file, the first piece writes under the original file's name so that
// later pieces can be appended to it
2243 if ((large) && (g == 0)) { flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2244 string thisOutputDir = outputDir;
2245 if (outputDir == "") { thisOutputDir = m->hasPath(flowFileName); }
2246 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2247 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2248 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2249 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2250 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2251 string groupFileName = fileRoot + "shhh.groups";
2254 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2255 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2256 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
2257 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
2258 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector); if (m->control_pressed) { break; }
// append this piece's outputs to the outputs named after the first piece, then delete them
// (the condition guarding this section is outside the lines shown)
2262 m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2263 m->mothurRemove(qualityFileName);
2264 m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2265 m->mothurRemove(fastaFileName);
2266 m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2267 m->mothurRemove(nameFileName);
2268 m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2269 m->mothurRemove(otuCountsFileName);
2270 m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2271 m->mothurRemove(groupFileName);
2273 m->mothurRemove(theseFlowFileNames[g]);
2277 m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
// on interrupt, discard everything written so far
2280 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2284 }catch(exception& e) {
2285 m->errorOut(e, "ShhherCommand", "driver");
2290 /**************************************************************************************************/
// Reads a flow file into the caller-supplied containers.
//   filename          - flow file to open.
//   thisSeqNameVector - out: sequence names, in file order.
//   thisLengths       - out: the integer read right after each sequence name.
//   thisFlowDataIntI  - out: intensities scaled by 100 and stored as integers, numFlowCells values per sequence, back to back.
//   thisNameMap       - out: sequence name -> 0-based position in the file.
//   numFlowCells      - out: first value in the file; the number of intensities read per sequence.
// All four containers are cleared before reading.
2291 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2296 m->openInputFile(filename, flowFile);
2299 int currentNumFlowCells;
2301 thisSeqNameVector.clear();
2302 thisLengths.clear();
2303 thisFlowDataIntI.clear();
2304 thisNameMap.clear();
2306 flowFile >> numFlowCells;
2307 int index = 0;//pcluster
// NOTE(review): the loop is driven by eof() and the extractions below are not followed by a stream-state check here;
// confirm how a truncated or malformed record is handled.
2308 while(!flowFile.eof()){
2310 if (m->control_pressed) { break; }
2312 flowFile >> seqName >> currentNumFlowCells;
2313 thisLengths.push_back(currentNumFlowCells);
2315 thisSeqNameVector.push_back(seqName);
2316 thisNameMap[seqName] = index++;//pcluster
2318 for(int i=0;i<numFlowCells;i++){
2319 flowFile >> intensity;
// Cap the intensity so the scaled integer below is at most 999.
2320 if(intensity > 9.99) { intensity = 9.99; }
// Scale to hundredths; the 0.0001 is presumably there to offset floating-point error before truncation.
2321 int intI = int(100 * intensity + 0.0001);
2322 thisFlowDataIntI.push_back(intI);
2324 m->gobble(flowFile);
2328 int numSeqs = thisSeqNameVector.size();
// Zero every flow at or beyond each sequence's own length (thisLengths[i]).
2330 for(int i=0;i<numSeqs;i++){
2332 if (m->control_pressed) { break; }
2334 int iNumFlowCells = i * numFlowCells;
2335 for(int j=thisLengths[i];j<numFlowCells;j++){
2336 thisFlowDataIntI[iNumFlowCells + j] = 0;
2343 catch(exception& e) {
2344 m->errorOut(e, "ShhherCommand", "getFlowData");
2348 /**************************************************************************************************/
// Computes the flow distance for every pair (i, j) with j < i < stopSeq and writes the retained pairs to distFileName.
// Each pair is identified by mapUniqueToSeq[i] and mapUniqueToSeq[j]; the distance comes from calcPairwiseDist.
// A distance below 1e-6 is written as zero; otherwise the pair is written only when the distance is <= cutoff.
// Output lines have the form "<id i>\t<id j>\t<distance>" and are accumulated in an in-memory stream
// (fixed notation, 6 decimals) before being written to the file.
2350 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2353 ostringstream outStream;
2354 outStream.setf(ios::fixed, ios::floatfield);
2355 outStream.setf(ios::dec, ios::basefield);
2356 outStream.setf(ios::showpoint);
2357 outStream.precision(6);
// Wall-clock and CPU start times, used only for the progress messages below.
2359 int begTime = time(NULL);
2360 double begClock = clock();
2362 for(int i=0;i<stopSeq;i++){
2364 if (m->control_pressed) { break; }
2366 for(int j=0;j<i;j++){
2367 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2369 if(flowDistance < 1e-6){
2370 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2372 else if(flowDistance <= cutoff){
2373 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
// Progress report: row index, elapsed seconds, elapsed CPU seconds.
2377 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2378 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2379 m->mothurOutEndLine();
// NOTE(review): the stream is opened directly with ofstream and is not checked for success here.
2383 ofstream distFile(distFileName.c_str());
2384 distFile << outStream.str();
// NOTE(review): empty body - nothing is done here when the run was cancelled.
2387 if (m->control_pressed) {}
2389 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2390 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2391 m->mothurOutEndLine();
2396 catch(exception& e) {
2397 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2401 /**************************************************************************************************/
// Distance between the flowgrams of sequences seqA and seqB.
// Over the first minLength flows it accumulates
//   jointLookUp[intensityA * NUMBINS + intensityB] - flowDataPrI[A] - flowDataPrI[B]
// and then divides the sum by minLength.
// seqA / seqB index rows of flowDataIntI and flowDataPrI (numFlowCells values per row).
2403 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2405 int minLength = lengths[mapSeqToUnique[seqA]];
// NOTE(review): the test reads lengths[seqB] but the assignment reads lengths[mapSeqToUnique[seqB]];
// the two indices differ, so minLength is not guaranteed to be the smaller of the two looked-up lengths. Confirm the intended index.
2406 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
// Row offsets of the two sequences in the flattened flow arrays.
2408 int ANumFlowCells = seqA * numFlowCells;
2409 int BNumFlowCells = seqB * numFlowCells;
2413 for(int i=0;i<minLength;i++){
2415 if (m->control_pressed) { break; }
2417 int flowAIntI = flowDataIntI[ANumFlowCells + i];
2418 float flowAPrI = flowDataPrI[ANumFlowCells + i];
2420 int flowBIntI = flowDataIntI[BNumFlowCells + i];
2421 float flowBPrI = flowDataPrI[BNumFlowCells + i];
2422 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
// NOTE(review): no guard against minLength == 0 before this division - confirm callers never pass zero-length flowgrams.
2425 dist /= (float) minLength;
2428 catch(exception& e) {
2429 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2434 /**************************************************************************************************/
// Collapses the numSeqs flowgrams into unique flowgrams and fills flowDataPrI.
// Each sequence's intensities are first rounded to whole values ((x + 50) / 100, truncated) and then compared,
// over the shorter of the two lengths, against every unique flowgram found so far.
//   uniqueFlowgrams - out: rounded flowgram of each unique, numFlowCells values per unique.
//   uniqueCount     - out: per-unique counter (set to 1 when a unique is created).
//   uniqueLengths   - out: per-unique length; raised when a longer matching sequence is seen.
//   mapSeqToUnique  - out: sequence index -> unique index.
//   mapUniqueToSeq  - out: unique index -> index of the sequence that created it.
//   flowDataPrI     - out: getProbIntensity() of every entry of flowDataIntI.
2436 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
// Size every output for the worst case (all sequences unique); -1 marks unused slots.
2439 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2440 uniqueCount.assign(numSeqs, 0); // anWeights
2441 uniqueLengths.assign(numSeqs, 0);
2442 mapSeqToUnique.assign(numSeqs, -1);
2443 mapUniqueToSeq.assign(numSeqs, -1);
// NOTE(review): this local copy is filled and resized below but is not read anywhere in this function as shown - confirm it is needed.
2445 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2447 for(int i=0;i<numSeqs;i++){
2449 if (m->control_pressed) { break; }
// Round each intensity (stored in hundredths) to the nearest whole value.
2453 vector<short> current(numFlowCells);
2454 for(int j=0;j<numFlowCells;j++){
2455 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
// Compare against each existing unique over the shorter of the two lengths.
2458 for(int j=0;j<numUniques;j++){
2459 int offset = j * numFlowCells;
2463 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
2464 else { shorterLength = uniqueLengths[j]; }
2466 for(int k=0;k<shorterLength;k++){
2467 if(current[k] != uniqueFlowgrams[offset + k]){
2474 mapSeqToUnique[i] = j;
2477 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
// No existing unique was selected: register sequence i as a new unique.
2483 if(index == numUniques){
2484 uniqueLengths[numUniques] = lengths[i];
2485 uniqueCount[numUniques] = 1;
2486 mapSeqToUnique[i] = numUniques;//anMap
2487 mapUniqueToSeq[numUniques] = i;//anF
2489 for(int k=0;k<numFlowCells;k++){
2490 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2491 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
// Trim the worst-case allocations down to the number of uniques actually found.
2497 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2498 uniqueLengths.resize(numUniques);
// One value per (sequence, flow); assumes flowDataIntI holds at least numSeqs * numFlowCells entries.
2500 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2501 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
2505 catch(exception& e) {
2506 m->errorOut(e, "ShhherCommand", "getUniques");
2510 /**************************************************************************************************/
// Writes one line per unique flowgram to filename: "<mapUniqueToSeq[i]>\t<comma-separated member names>".
// Member names come from seqNameVector, grouped by mapSeqToUnique.
// Note that the first column is the numeric sequence index, not the sequence name
// (the commented-out line below is the name-based variant).
2511 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
// Build "name1,name2,...," for each unique; the trailing comma is stripped when writing.
2514 vector<string> duplicateNames(numUniques, "");
2515 for(int i=0;i<numSeqs;i++){
2516 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2520 m->openOutputFile(filename, nameFile);
2522 for(int i=0;i<numUniques;i++){
2524 if (m->control_pressed) { break; }
2526 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
// substr(0, find_last_of(',')) drops the trailing comma appended above.
2527 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2534 catch(exception& e) {
2535 m->errorOut(e, "ShhherCommand", "createNamesFile");
2539 //**********************************************************************************************************************
// Clusters the distances in distFileName and prints the resulting list to filename.
//   filename      - output list file.
//   distFileName  - column-format distance file, read with ReadColumnMatrix using the member cutoff.
//   namesFileName - names file used to build the NameAssignment passed to the reader.
// Clustering uses CompleteLinkage (constructed with the "furthest" label) and repeats cluster->update()
// while the smallest remaining distance is <= cutoff and the matrix still has nodes.
2541 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2544 ReadMatrix* read = new ReadColumnMatrix(distFileName);
2545 read->setCutoff(cutoff);
2547 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2548 clusterNameMap->readMap();
2549 read->read(clusterNameMap);
// The list and matrix are owned by this function from here on (deleted at the end).
2551 ListVector* list = read->getListVector();
2552 SparseMatrix* matrix = read->getMatrix();
2555 delete clusterNameMap;
2557 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2559 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
2560 string tag = cluster->getTag();
2562 double clusterCutoff = cutoff;
2563 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2565 if (m->control_pressed) { break; }
2567 cluster->update(clusterCutoff);
// The list is labelled with the cutoff value before it is printed.
2570 list->setLabel(toString(cutoff));
2573 m->openOutputFile(filename, listFile);
2574 list->print(listFile);
// NOTE(review): these raw-pointer deletes are skipped if anything above throws.
2577 delete matrix; delete cluster; delete rabund; delete list;
2581 catch(exception& e) {
2582 m->errorOut(e, "ShhherCommand", "cluster");
2586 /**************************************************************************************************/
// Reads a list file (label, number of OTUs, then one comma-separated name string per OTU) and builds the OTU membership tables.
//   otuData     - out: sized to numSeqs.
//   cumNumSeqs  - out: for OTU i, the sum of nSeqsPerOTU over OTUs 0..i-1.
//   nSeqsPerOTU - out: sized to numOTUs; read below as the member count of each OTU.
//   aaP         - out: one row per OTU holding the sorted member indices, then padded with 0 up to numSeqs.
//   seqNumber   - appended with each OTU's sorted member indices, OTU by OTU.
//   seqIndex    - out: copy of seqNumber.
//   nameMap     - in/out: name -> sequence index; every name that is looked up is erased from the map.
2588 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
2589 vector<int>& cumNumSeqs,
2590 vector<int>& nSeqsPerOTU,
2591 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2592 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2593 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
2594 vector<int>& seqIndex,
2595 map<string, int>& nameMap){
2599 m->openInputFile(fileName, listFile);
2603 listFile >> label >> numOTUs;
2605 otuData.assign(numSeqs, 0);
2606 cumNumSeqs.assign(numOTUs, 0);
2607 nSeqsPerOTU.assign(numOTUs, 0);
2608 aaP.clear();aaP.resize(numOTUs);
2614 string singleOTU = "";
2616 for(int i=0;i<numOTUs;i++){
2618 if (m->control_pressed) { break; }
2620 listFile >> singleOTU;
// Walk the OTU string one character at a time to split it into names.
2622 istringstream otuString(singleOTU);
2626 string seqName = "";
2628 for(int j=0;j<singleOTU.length();j++){
2629 char letter = otuString.get();
// NOTE(review): the iterator returned by find() is dereferenced without comparing it to nameMap.end();
// a name that is absent from nameMap would be undefined behaviour. Confirm the list file can only contain known names.
2635 map<string,int>::iterator nmIt = nameMap.find(seqName);
2636 int index = nmIt->second;
2638 nameMap.erase(nmIt);
2642 aaP[i].push_back(index);
// Same lookup for the name still pending in seqName; the same unchecked-iterator caveat applies.
2647 map<string,int>::iterator nmIt = nameMap.find(seqName);
2649 int index = nmIt->second;
2650 nameMap.erase(nmIt);
2654 aaP[i].push_back(index);
2659 sort(aaP[i].begin(), aaP[i].end());
2660 for(int j=0;j<nSeqsPerOTU[i];j++){
2661 seqNumber.push_back(aaP[i][j]);
// Pad the row with zeros for j = nSeqsPerOTU[i] .. numSeqs-1.
2663 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2664 aaP[i].push_back(0);
// Prefix sums: offset of each OTU's first member within seqNumber / seqIndex.
2670 for(int i=1;i<numOTUs;i++){
2671 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2674 seqIndex = seqNumber;
2681 catch(exception& e) {
2682 m->errorOut(e, "ShhherCommand", "getOTUData");
2686 /**************************************************************************************************/
// Re-selects the centroid (representative unique flowgram) of every OTU.
// For each OTU the tau values of its members are summed; when the OTU is non-empty and that sum exceeds MIN_COUNT,
// a list of candidate unique flowgrams (anL) is built from the members, the tau-weighted distance of every member to every
// candidate is accumulated in adF, and the candidate with the smallest total becomes centroids[i].
// change[i] is reset to 0 at the start of each OTU.
2688 int ShhherCommand::calcCentroidsDriver(int numOTUs,
2689 vector<int>& cumNumSeqs,
2690 vector<int>& nSeqsPerOTU,
2691 vector<int>& seqIndex,
2692 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
2693 vector<int>& centroids, //the representative flowgram for each cluster m
2694 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2695 vector<int>& mapSeqToUnique,
2696 vector<short>& uniqueFlowgrams,
2697 vector<short>& flowDataIntI,
2698 vector<int>& lengths,
2700 vector<int>& seqNumber){
2702 //this function gets the most likely homopolymer length at a flow position for a group of sequences
2707 for(int i=0;i<numOTUs;i++){
2709 if (m->control_pressed) { break; }
// Sentinels for the minimum search over the candidates below.
2713 int minFlowGram = 100000000;
2714 double minFlowValue = 1e8;
2715 change[i] = 0; //FALSE
// Total tau mass assigned to this OTU.
2717 for(int j=0;j<nSeqsPerOTU[i];j++){
2718 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2721 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
// anL holds candidate unique-flowgram indices, adF the accumulated weighted distance for each candidate.
2722 vector<double> adF(nSeqsPerOTU[i]);
2723 vector<int> anL(nSeqsPerOTU[i]);
2725 for(int j=0;j<nSeqsPerOTU[i];j++){
2726 int index = cumNumSeqs[i] + j;
2727 int nI = seqIndex[index];
2728 int nIU = mapSeqToUnique[nI];
2731 for(k=0;k<position;k++){
2737 anL[position] = nIU;
2738 adF[position] = 0.0000;
// Accumulate, for every candidate, the tau-weighted distance from each member of the OTU.
2743 for(int j=0;j<nSeqsPerOTU[i];j++){
2744 int index = cumNumSeqs[i] + j;
2745 int nI = seqIndex[index];
2747 double tauValue = singleTau[seqNumber[index]];
2749 for(int k=0;k<position;k++){
2750 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2751 adF[k] += dist * tauValue;
// Pick the candidate with the smallest accumulated distance.
2755 for(int j=0;j<position;j++){
2756 if(adF[j] < minFlowValue){
2758 minFlowValue = adF[j];
2762 if(centroids[i] != anL[minFlowGram]){
2764 centroids[i] = anL[minFlowGram];
// OTU is empty or below MIN_COUNT but still has a centroid assigned.
2767 else if(centroids[i] != -1){
2775 catch(exception& e) {
2776 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2780 /**************************************************************************************************/
// Average lookup cost between a centroid flowgram and a sequence's observed intensities.
//   cent   - row of uniqueFlowgrams to use as the centroid.
//   flow   - row of flowDataIntI holding the sequence's intensities.
//   length - number of flows summed, and the divisor of the returned average.
// Each term is singleLookUp[centroidValue * NUMBINS + intensity].
2782 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2783 vector<short>& flowDataIntI, int numFlowCells){
// Starting offsets of the two rows in the flattened arrays.
2786 int flowAValue = cent * numFlowCells;
2787 int flowBValue = flow * numFlowCells;
2791 for(int i=0;i<length;i++){
2792 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
// NOTE(review): no guard against length == 0 before this division - confirm callers never pass a zero length.
2797 return dist / (double)length;
2799 catch(exception& e) {
2800 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2804 /**************************************************************************************************/
// Updates weight[i] for every OTU from the tau values of its members and tracks the largest absolute change.
// For each OTU the previous weight is remembered, the members' singleTau values are added into weight[i],
// and |new - old| is compared against the running maximum (maxChange).
2806 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2809 double maxChange = 0;
2811 for(int i=0;i<numOTUs;i++){
2813 if (m->control_pressed) { break; }
// Holds the old weight until the new one has been accumulated.
2815 double difference = weight[i];
2818 for(int j=0;j<nSeqsPerOTU[i];j++){
2819 int index = cumNumSeqs[i] + j;
2820 double tauValue = singleTau[seqNumber[index]];
2821 weight[i] += tauValue;
2824 difference = fabs(weight[i] - difference);
2825 if(difference > maxChange){ maxChange = difference; }
2829 catch(exception& e) {
2830 m->errorOut(e, "ShhherCommand", "getNewWeights");
2835 /**************************************************************************************************/
// Computes the negative log-likelihood term (nLL) for the current OTU assignment.
// For every OTU member, weight[otu] * exp(-dist * sigma) is added to P[sequence]; any P left at exactly zero is
// replaced by DBL_EPSILON, and numSeqs * log(sigma) is subtracted from nLL.
//   dist is indexed through seqNumber, P through seqIndex; both are offset by cumNumSeqs[otu].
2837 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
// Per-sequence probability mass, accumulated over all OTUs the sequence belongs to.
2841 vector<long double> P(numSeqs, 0);
// First pass only considers OTUs whose weight exceeds MIN_WEIGHT.
2844 for(int i=0;i<numOTUs;i++){
2845 if(weight[i] > MIN_WEIGHT){
2851 for(int i=0;i<numOTUs;i++){
2853 if (m->control_pressed) { break; }
2855 for(int j=0;j<nSeqsPerOTU[i];j++){
2856 int index = cumNumSeqs[i] + j;
2857 int nI = seqIndex[index];
2858 double singleDist = dist[seqNumber[index]];
2860 P[nI] += weight[i] * exp(-singleDist * sigma);
2864 for(int i=0;i<numSeqs;i++){
// Replace an exact zero with DBL_EPSILON so P[i] stays strictly positive.
2865 if(P[i] == 0){ P[i] = DBL_EPSILON; }
2870 nLL = nLL -(double)numSeqs * log(sigma);
2874 catch(exception& e) {
// Report this function's own name (the handler previously said "getNewWeights", which misattributed failures).
2875 m->errorOut(e, "ShhherCommand", "getLikelihood");
2880 /**************************************************************************************************/
// Examines the centroids for OTUs that are unusable or that share a centroid with an earlier OTU.
// The first loop tests each OTU for an unset centroid (-1) or a weight below MIN_WEIGHT.
// The second loop compares every later OTU j with OTU i and, where both have the same centroid,
// adds weight[j] into weight[i].
2882 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
// Per-OTU flag, initialised to 1 for every OTU.
2884 vector<int> unique(numOTUs, 1);
2886 for(int i=0;i<numOTUs;i++){
2887 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2892 for(int i=0;i<numOTUs;i++){
2894 if (m->control_pressed) { break; }
2897 for(int j=i+1;j<numOTUs;j++){
2900 if(centroids[j] == centroids[i]){
2904 weight[i] += weight[j];
2914 catch(exception& e) {
2915 m->errorOut(e, "ShhherCommand", "checkCentroids");
2919 /**************************************************************************************************/
// Recomputes sequence-to-centroid distances and the tau (membership) values for every sequence.
// For each sequence i:
//   - dist[i * numOTUs + j] is refreshed for OTUs whose weight exceeds MIN_WEIGHT and whose change flag is 1;
//   - offset becomes the smallest distance among OTUs with weight > MIN_WEIGHT;
//   - newTau[j] = exp(sigma * (offset - dist)) * weight[j], then normalised by their sum (norms[i]);
//   - every OTU with newTau[j] > MIN_TAU gets a new entry: singleTau/seqNumber/seqIndex are grown to `total`,
//     the tau is stored at the new slot, and aaP / aaI record that slot and the sequence index for OTU j.
// nSeqsPerOTU is reset to zero at the start.
2921 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
2922 vector<double>& weight, vector<short>& change, vector<int>& centroids,
2923 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
2924 vector<int>& seqNumber, vector<int>& seqIndex,
2925 vector<short>& uniqueFlowgrams,
2926 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2931 vector<double> newTau(numOTUs,0);
2932 vector<double> norms(numSeqs, 0);
2933 nSeqsPerOTU.assign(numOTUs, 0);
2935 for(int i=0;i<numSeqs;i++){
2937 if (m->control_pressed) { break; }
// dist is a flattened numSeqs x numOTUs table; this is the start of row i.
2939 int indexOffset = i * numOTUs;
2941 double offset = 1e8;
2943 for(int j=0;j<numOTUs;j++){
// Only OTUs whose centroid changed need their distance recomputed.
2945 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2946 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2949 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2950 offset = dist[indexOffset + j];
2954 for(int j=0;j<numOTUs;j++){
2955 if(weight[j] > MIN_WEIGHT){
// Subtracting the minimum distance keeps the exponent at or below zero.
2956 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2957 norms[i] += newTau[j];
// NOTE(review): norms[i] stays 0 when no OTU has weight > MIN_WEIGHT; no guard precedes this division here - confirm.
2964 for(int j=0;j<numOTUs;j++){
2965 newTau[j] /= norms[i];
2968 for(int j=0;j<numOTUs;j++){
2969 if(newTau[j] > MIN_TAU){
2971 int oldTotal = total;
2975 singleTau.resize(total, 0);
2976 seqNumber.resize(total, 0);
2977 seqIndex.resize(total, 0);
2979 singleTau[oldTotal] = newTau[j];
// aaP stores the slot in singleTau, aaI the sequence index, both at position nSeqsPerOTU[j] of OTU j's row.
2981 aaP[j][nSeqsPerOTU[j]] = oldTotal;
2982 aaI[j][nSeqsPerOTU[j]] = i;
2990 catch(exception& e) {
2991 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2995 /**************************************************************************************************/
// Flattens the per-OTU rows of aaP and aaI into seqNumber and seqIndex.
// cumNumSeqs[i] receives the position at which OTU i's entries start; the first nSeqsPerOTU[i] entries
// of aaP[i] / aaI[i] are copied to seqNumber / seqIndex at the running position `index`.
// Returns 0 immediately if the run was cancelled.
2997 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3000 for(int i=0;i<numOTUs;i++){
3002 if (m->control_pressed) { return 0; }
3004 cumNumSeqs[i] = index;
3005 for(int j=0;j<nSeqsPerOTU[i];j++){
// Assumes seqNumber and seqIndex are already large enough for position `index`.
3006 seqNumber[index] = aaP[i][j];
3007 seqIndex[index] = aaI[i][j];
3015 catch(exception& e) {
3016 m->errorOut(e, "ShhherCommand", "fill");
3020 /**************************************************************************************************/
// Converts the soft (tau-weighted) memberships into a hard assignment of each sequence to one OTU.
// A dense numSeqs x numOTUs tau matrix is built from the current membership lists, the OTU selected by the
// maximum-tau scan is stored in otuData[i], and the membership tables are rebuilt so that each sequence
// appears once with singleTau[i] = 1. fill() then regenerates seqNumber, seqIndex and cumNumSeqs.
3022 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3023 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
// Row = sequence, column = OTU; entries default to 0 where a sequence is not a member.
3026 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3028 for(int i=0;i<numOTUs;i++){
3030 if (m->control_pressed) { break; }
3032 for(int j=0;j<nSeqsPerOTU[i];j++){
3033 int index = cumNumSeqs[i] + j;
3034 double tauValue = singleTau[seqNumber[index]];
3035 int sIndex = seqIndex[index];
3036 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
// For each sequence, scan its row for the largest tau.
3040 for(int i=0;i<numSeqs;i++){
3041 double maxTau = -1.0000;
3043 for(int j=0;j<numOTUs;j++){
3044 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3045 maxTau = bigTauMatrix[i * numOTUs + j];
3050 otuData[i] = maxOTU;
// Rebuild the membership lists from the hard assignment.
3053 nSeqsPerOTU.assign(numOTUs, 0);
3055 for(int i=0;i<numSeqs;i++){
3056 int index = otuData[i];
3058 singleTau[i] = 1.0000;
3061 aaP[index][nSeqsPerOTU[index]] = i;
3062 aaI[index][nSeqsPerOTU[index]] = i;
3064 nSeqsPerOTU[index]++;
3067 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
3069 catch(exception& e) {
3070 m->errorOut(e, "ShhherCommand", "setOTUs");
3074 /**************************************************************************************************/
// Computes per-base quality scores for every OTU and writes them to qualityFileName.
// For each non-empty OTU and each flow position, pr[s] accumulates tau * singleLookUp[s * NUMBINS + intensity] over the
// OTU's members for every homopolymer length s < HOMOPS; the centroid's value at that flow selects maxPrIndex.
// When the accumulated count exceeds MIN_COUNT a score is derived for each base of that homopolymer, capped at 100.
// OTUs with otuCounts[i] > 0 are written as a ">name" header followed by space-separated scores starting at index 4.
// The file name is appended to outputNames.
3076 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3077 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3078 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3082 ofstream qualityFile;
3083 m->openOutputFile(qualityFileName, qualityFile);
3085 qualityFile.setf(ios::fixed, ios::floatfield);
3086 qualityFile.setf(ios::showpoint);
3087 qualityFile << setprecision(6);
// One score row per OTU; pr holds the accumulated lookup cost for each homopolymer length.
3089 vector<vector<int> > qualities(numOTUs);
3090 vector<double> pr(HOMOPS, 0);
3093 for(int i=0;i<numOTUs;i++){
3095 if (m->control_pressed) { break; }
3100 if(nSeqsPerOTU[i] > 0){
// -1 marks "no score"; the writer below stops at the first -1.
3101 qualities[i].assign(1024, -1);
3103 while(index < numFlowCells){
3104 double maxPrValue = 1e8;
3105 short maxPrIndex = -1;
3106 double count = 0.0000;
3108 pr.assign(HOMOPS, 0);
3110 for(int j=0;j<nSeqsPerOTU[i];j++){
3111 int lIndex = cumNumSeqs[i] + j;
3112 double tauValue = singleTau[seqNumber[lIndex]];
3113 int sequenceIndex = aaI[i][j];
3114 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3118 for(int s=0;s<HOMOPS;s++){
3119 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
// The centroid's homopolymer length at this flow is taken as the called length.
3123 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3124 maxPrValue = pr[maxPrIndex];
3126 if(count > MIN_COUNT){
3128 double norm = 0.0000;
3130 for(int s=0;s<HOMOPS;s++){
3131 norm += exp(-(pr[s] - maxPrValue));
3134 for(int s=1;s<=maxPrIndex;s++){
3136 double temp = 0.0000;
3138 U += exp(-(pr[s-1]-maxPrValue))/norm;
3146 temp = floor(-10 * temp);
3147 value = (int)floor(temp);
3148 if(value > 100){ value = 100; }
3150 qualities[i][base] = (int)value;
3160 if(otuCounts[i] > 0){
// NOTE(review): the header uses seqNameVector[mapUniqueToSeq[i]] with i being an OTU index, whereas writeSequences and
// writeNames use seqNameVector[aaI[i][0]] - confirm the quality headers are meant to match the fasta headers.
3161 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3163 int j=4; //need to get past the first four bases
// Bounds are checked before the element is read: the row is empty when the OTU had no members above,
// and a full row carries no -1 sentinel, so the unguarded read could run past the end.
3164 while(j < (int)qualities[i].size() && qualities[i][j] != -1){
3165 qualityFile << qualities[i][j] << ' ';
// Retained from the original; the loop condition above now performs the effective bounds check.
3166 if (j > qualities[i].size()) { break; }
3169 qualityFile << endl;
3172 qualityFile.close();
3173 outputNames.push_back(qualityFileName);
3176 catch(exception& e) {
3177 m->errorOut(e, "ShhherCommand", "writeQualities");
3182 /**************************************************************************************************/
// Writes one FASTA record per OTU with otuCounts[i] > 0 to fastaFileName.
// The header is the name of the OTU's first member (seqNameVector[aaI[i][0]]); the sequence is built by emitting,
// for each flow j, the base for that flow as many times as the centroid's value in uniqueFlowgrams.
// The first four bases are dropped on output; a sequence shorter than four bases is written as "NNNN".
// The file name is appended to outputNames and, when thisCompositeFASTAFileName is non-empty, the file is
// appended to that composite file as well.
3184 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3188 m->openOutputFile(fastaFileName, fastaFile);
3190 vector<string> names(numOTUs, "");
3192 for(int i=0;i<numOTUs;i++){
3194 if (m->control_pressed) { break; }
// Row of uniqueFlowgrams that represents this OTU.
3196 int index = centroids[i];
3198 if(otuCounts[i] > 0){
3199 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3203 for(int j=0;j<numFlowCells;j++){
// Cycle through the flow order by its actual length instead of a hard-coded 4, so an order that is not
// exactly four characters long (presumably settable through the `order` parameter) maps flows to the right base.
// Assumes flowOrder is a non-empty string.
3205 char base = flowOrder[j % flowOrder.length()];
3206 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3211 if (newSeq.length() >= 4) { fastaFile << newSeq.substr(4) << endl; }
3212 else { fastaFile << "NNNN" << endl; }
3217 outputNames.push_back(fastaFileName);
3219 if(thisCompositeFASTAFileName != ""){
3220 m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3223 catch(exception& e) {
3224 m->errorOut(e, "ShhherCommand", "writeSequences");
3229 /**************************************************************************************************/
// Writes a names file with one line per OTU that has otuCounts[i] > 0.
// Each line starts with the first member's name, a tab, the same name again, and then the remaining
// nSeqsPerOTU[i] - 1 member names separated by commas (names come from seqNameVector via aaI[i]).
// The file name is appended to outputNames and, when thisCompositeNamesFileName is non-empty, the file is
// appended to that composite file as well.
3231 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3235 m->openOutputFile(nameFileName, nameFile);
3237 for(int i=0;i<numOTUs;i++){
3239 if (m->control_pressed) { break; }
3241 if(otuCounts[i] > 0){
// The first member is both the representative (column 1) and the first entry of the member list (column 2).
3242 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3244 for(int j=1;j<nSeqsPerOTU[i];j++){
3245 nameFile << ',' << seqNameVector[aaI[i][j]];
3252 outputNames.push_back(nameFileName);
3255 if(thisCompositeNamesFileName != ""){
3256 m->appendFiles(nameFileName, thisCompositeNamesFileName);
3259 catch(exception& e) {
3260 m->errorOut(e, "ShhherCommand", "writeNames");
3265 /**************************************************************************************************/
// Writes a groups file: one "<sequence name>\t<fileRoot>" line for each of the first numSeqs names.
// fileRoot is written verbatim as the group label. The file name is appended to outputNames.
3267 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3270 m->openOutputFile(groupFileName, groupFile);
3272 for(int i=0;i<numSeqs;i++){
3273 if (m->control_pressed) { break; }
3274 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3277 outputNames.push_back(groupFileName);
3280 catch(exception& e) {
3281 m->errorOut(e, "ShhherCommand", "writeGroups");
3286 /**************************************************************************************************/
// Writes a per-OTU report to otuCountsFileName.
// For every OTU with otuCounts[i] > 0 it writes an "ideal" line (the centroid flowgram translated to bases,
// starting at flow 8) followed by one line per member: the member's name, a tab, and that member's own flowgram
// translated to bases with the first four bases dropped ("NNNN" if shorter than four).
// Member intensities (stored in hundredths) are rounded to the nearest whole homopolymer length.
// The file name is appended to outputNames.
3288 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3290 ofstream otuCountsFile;
3291 m->openOutputFile(otuCountsFileName, otuCountsFile);
// Local copy of the flow order; flow j is assigned base bases[j mod length].
3293 string bases = flowOrder;
3295 for(int i=0;i<numOTUs;i++){
3297 if (m->control_pressed) {
3300 //output the translated version of the centroid sequence for the otu
3301 if(otuCounts[i] > 0){
3302 int index = centroids[i];
3304 otuCountsFile << "ideal\t";
3305 for(int j=8;j<numFlowCells;j++){
// Use the flow order's actual length rather than a hard-coded 4 so orders of other lengths translate correctly.
// Assumes bases is non-empty.
3306 char base = bases[j % bases.length()];
3307 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3308 otuCountsFile << base;
3311 otuCountsFile << endl;
3313 for(int j=0;j<nSeqsPerOTU[i];j++){
3314 int sequence = aaI[i][j];
3315 otuCountsFile << seqNameVector[sequence] << '\t';
3319 for(int k=0;k<lengths[sequence];k++){
// Same cycling rule as for the centroid line above.
3320 char base = bases[k % bases.length()];
// Intensities are stored in hundredths; round to the nearest whole homopolymer length.
3321 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3323 for(int s=0;s<freq;s++){
3325 //otuCountsFile << base;
3329 if (newSeq.length() >= 4) { otuCountsFile << newSeq.substr(4) << endl; }
3330 else { otuCountsFile << "NNNN" << endl; }
3332 otuCountsFile << endl;
3335 otuCountsFile.close();
3336 outputNames.push_back(otuCountsFileName);
3339 catch(exception& e) {
3340 m->errorOut(e, "ShhherCommand", "writeClusters");
3345 /**************************************************************************************************/
// Loads singleLookUp from the file named by lookupFileName.
// The table is sized to HOMOPS * NUMBINS and zero-filled; for each of the HOMOPS rows one leading value
// (logFracFreq) is read, followed by NUMBINS table entries.
3347 void ShhherCommand::getSingleLookUp(){
3349 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
3350 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3353 ifstream lookUpFile;
3354 m->openInputFile(lookupFileName, lookUpFile);
3356 for(int i=0;i<HOMOPS;i++){
3358 if (m->control_pressed) { break; }
// Leading value of the row; it is read here but not stored in the table.
3361 lookUpFile >> logFracFreq;
3363 for(int j=0;j<NUMBINS;j++) {
3364 lookUpFile >> singleLookUp[index];
3370 catch(exception& e) {
3371 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3376 /**************************************************************************************************/
// Builds jointLookUp (NUMBINS x NUMBINS, flattened row-major) from singleLookUp.
// Entry (i, j) is the minimum over all homopolymer lengths k < HOMOPS of
// singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j].
// singleLookUp must already be populated.
3378 void ShhherCommand::getJointLookUp(){
3381 // the most likely joint probability (-log) that two intensities have the same polymer length
3382 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3384 for(int i=0;i<NUMBINS;i++){
3386 if (m->control_pressed) { break; }
3388 for(int j=0;j<NUMBINS;j++){
// Large sentinel so that any real sum replaces it.
3390 double minSum = 100000000;
3392 for(int k=0;k<HOMOPS;k++){
3393 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3395 if(sum < minSum) { minSum = sum; }
3397 jointLookUp[i * NUMBINS + j] = minSum;
3401 catch(exception& e) {
3402 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3407 /**************************************************************************************************/
// Returns the smallest singleLookUp entry for the given intensity bin across all homopolymer lengths,
// i.e. min over i < HOMOPS of singleLookUp[i * NUMBINS + intIntensity].
// intIntensity is used directly as a column index and is not range-checked here.
3409 double ShhherCommand::getProbIntensity(int intIntensity){
// Large sentinel so that any real table entry replaces it.
3412 double minNegLogProb = 100000000;
3415 for(int i=0;i<HOMOPS;i++){//loop signal strength
3417 if (m->control_pressed) { break; }
// NOTE(review): the table entry is narrowed to float before the comparison - confirm the precision loss is intended.
3419 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3420 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
3423 return minNegLogProb;
3425 catch(exception& e) {
3426 m->errorOut(e, "ShhherCommand", "getProbIntensity");