5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "shhhercommand.h"
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "ShhherCommand", "setParameters");
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
44 m->errorOut(e, "ShhherCommand", "getHelpString");
48 //**********************************************************************************************************************
50 ShhherCommand::ShhherCommand(){
52 abort = true; calledHelp = true;
55 //initialize outputTypes
56 // vector<string> tempOutNames;
57 // outputTypes["pn.dist"] = tempOutNames;
61 m->errorOut(e, "ShhherCommand", "ShhherCommand");
66 //**********************************************************************************************************************
68 ShhherCommand::ShhherCommand(string option) {
70 abort = false; calledHelp = false;
72 //allow user to run help
73 if(option == "help") { help(); abort = true; calledHelp = true; }
74 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
77 vector<string> myArray = setParameters();
79 OptionParser parser(option);
80 map<string,string> parameters = parser.getParameters();
82 ValidParameters validParameter;
83 map<string,string>::iterator it;
85 //check to make sure all parameters are valid for command
86 for (it = parameters.begin(); it != parameters.end(); it++) {
87 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
90 //initialize outputTypes
91 vector<string> tempOutNames;
92 // outputTypes["pn.dist"] = tempOutNames;
93 // outputTypes["fasta"] = tempOutNames;
95 //if the user changes the input directory command factory will send this info to us in the output parameter
96 string inputDir = validParameter.validFile(parameters, "inputdir", false);
97 if (inputDir == "not found"){ inputDir = ""; }
100 it = parameters.find("flow");
101 //user has given a template file
102 if(it != parameters.end()){
103 path = m->hasPath(it->second);
104 //if the user has not given a path then, add inputdir. else leave path alone.
105 if (path == "") { parameters["flow"] = inputDir + it->second; }
108 it = parameters.find("lookup");
109 //user has given a template file
110 if(it != parameters.end()){
111 path = m->hasPath(it->second);
112 //if the user has not given a path then, add inputdir. else leave path alone.
113 if (path == "") { parameters["lookup"] = inputDir + it->second; }
116 it = parameters.find("file");
117 //user has given a template file
118 if(it != parameters.end()){
119 path = m->hasPath(it->second);
120 //if the user has not given a path then, add inputdir. else leave path alone.
121 if (path == "") { parameters["file"] = inputDir + it->second; }
126 //check for required parameters
127 flowFileName = validParameter.validFile(parameters, "flow", true);
128 flowFilesFileName = validParameter.validFile(parameters, "file", true);
129 if (flowFileName == "not found" && flowFilesFileName == "not found") {
130 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
131 m->mothurOutEndLine();
134 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
136 if(flowFileName != "not found"){
137 compositeFASTAFileName = "";
138 compositeNamesFileName = "";
143 //flow.files = 9 character offset
144 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
145 m->openOutputFile(compositeFASTAFileName, temp);
148 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
149 m->openOutputFile(compositeNamesFileName, temp);
153 if(flowFilesFileName != "not found"){
156 ifstream flowFilesFile;
157 m->openInputFile(flowFilesFileName, flowFilesFile);
158 while(flowFilesFile){
159 fName = m->getline(flowFilesFile);
161 //test if file is valid
163 int ableToOpen = m->openInputFile(fName, in, "noerror");
165 if (ableToOpen == 1) {
166 if (inputDir != "") { //default path is set
167 string tryPath = inputDir + fName;
168 m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
170 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
176 if (ableToOpen == 1) {
177 if (m->getDefaultPath() != "") { //default path is set
178 string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
179 m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
181 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
187 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
188 if (ableToOpen == 1) {
189 string exepath = m->argv;
190 string tempPath = exepath;
191 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
192 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
194 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
195 m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
197 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
202 if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
203 else { flowFileVector.push_back(fName); }
204 m->gobble(flowFilesFile);
206 flowFilesFile.close();
207 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
210 flowFileVector.push_back(flowFileName);
214 //if the user changes the output directory command factory will send this info to us in the output parameter
215 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
217 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it
221 //check for optional parameter and set defaults
222 // ...at some point should added some additional type checking...
224 temp = validParameter.validFile(parameters, "lookup", true);
225 if (temp == "not found") {
226 lookupFileName = "LookUp_Titanium.pat";
230 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
233 //if you can't open it, try input location
234 if (ableToOpen == 1) {
235 if (inputDir != "") { //default path is set
236 string tryPath = inputDir + lookupFileName;
237 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
239 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
241 lookupFileName = tryPath;
245 //if you can't open it, try default location
246 if (ableToOpen == 1) {
247 if (m->getDefaultPath() != "") { //default path is set
248 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
249 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
251 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
253 lookupFileName = tryPath;
257 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
258 if (ableToOpen == 1) {
259 string exepath = m->argv;
260 string tempPath = exepath;
261 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
262 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
264 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
265 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
267 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
269 lookupFileName = tryPath;
272 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
274 else if(temp == "not open") {
276 lookupFileName = validParameter.validFile(parameters, "lookup", false);
278 //if you can't open it its not inputDir, try mothur excutable location
279 string exepath = m->argv;
280 string tempPath = exepath;
281 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
282 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
284 string tryPath = m->getFullPathName(exepath) + lookupFileName;
285 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
287 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
289 lookupFileName = tryPath;
291 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
292 }else { lookupFileName = temp; }
294 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
295 m->setProcessors(temp);
296 m->mothurConvert(temp, processors);
298 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
299 m->mothurConvert(temp, cutoff);
301 temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
302 m->mothurConvert(temp, minDelta);
304 temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
305 m->mothurConvert(temp, maxIters);
307 temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
308 m->mothurConvert(temp, sigma);
310 flowOrder = validParameter.validFile(parameters, "order", false);
311 if (flowOrder == "not found"){ flowOrder = "TACG"; }
312 else if(flowOrder.length() != 4){
313 m->mothurOut("The value of the order option must be four bases long\n");
318 catch(exception& e) {
319 m->errorOut(e, "ShhherCommand", "ShhherCommand");
323 //**********************************************************************************************************************
325 int ShhherCommand::execute(){
327 if (abort == true) { if (calledHelp) { return 0; } return 2; }
331 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
332 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
336 for(int i=1;i<ncpus;i++){
337 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
339 if(abort == 1){ return 0; }
343 m->mothurOut("\nGetting preliminary data...\n");
344 getSingleLookUp(); if (m->control_pressed) { return 0; }
345 getJointLookUp(); if (m->control_pressed) { return 0; }
347 int numFiles = flowFileVector.size();
349 for(int i=1;i<ncpus;i++){
350 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
353 for(int i=0;i<numFiles;i++){
355 if (m->control_pressed) { break; }
357 double begClock = clock();
358 unsigned long long begTime = time(NULL);
360 flowFileName = flowFileVector[i];
362 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
363 m->mothurOut("Reading flowgrams...\n");
366 if (m->control_pressed) { break; }
368 m->mothurOut("Identifying unique flowgrams...\n");
371 if (m->control_pressed) { break; }
373 m->mothurOut("Calculating distances between flowgrams...\n");
375 strcpy(fileName, flowFileName.c_str());
377 for(int i=1;i<ncpus;i++){
378 MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
380 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
381 MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
382 MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
383 MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
384 MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
385 MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
386 MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
387 MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
388 MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
389 MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
392 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
394 if (m->control_pressed) { break; }
397 for(int i=1;i<ncpus;i++){
398 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
400 m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
401 m->mothurRemove((distFileName + ".temp." + toString(i)));
404 string namesFileName = createNamesFile();
406 if (m->control_pressed) { break; }
408 m->mothurOut("\nClustering flowgrams...\n");
409 string listFileName = cluster(distFileName, namesFileName);
411 if (m->control_pressed) { break; }
413 getOTUData(listFileName);
415 m->mothurRemove(distFileName);
416 m->mothurRemove(namesFileName);
417 m->mothurRemove(listFileName);
419 if (m->control_pressed) { break; }
423 if (m->control_pressed) { break; }
425 for(int i=1;i<ncpus;i++){
426 MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
427 MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
428 MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
429 MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
432 if (m->control_pressed) { break; }
437 int numOTUsOnCPU = numOTUs / ncpus;
438 int numSeqsOnCPU = numSeqs / ncpus;
439 m->mothurOut("\nDenoising flowgrams...\n");
440 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
442 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
444 double cycClock = clock();
445 unsigned long long cycTime = time(NULL);
448 if (m->control_pressed) { break; }
450 int total = singleTau.size();
451 for(int i=1;i<ncpus;i++){
452 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
453 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
454 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
456 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
457 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
458 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
459 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
460 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
463 calcCentroidsDriver(0, numOTUsOnCPU);
465 for(int i=1;i<ncpus;i++){
466 int otuStart = i * numOTUs / ncpus;
467 int otuStop = (i + 1) * numOTUs / ncpus;
469 vector<int> tempCentroids(numOTUs, 0);
470 vector<short> tempChange(numOTUs, 0);
472 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
473 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
475 for(int j=otuStart;j<otuStop;j++){
476 centroids[j] = tempCentroids[j];
477 change[j] = tempChange[j];
481 maxDelta = getNewWeights(); if (m->control_pressed) { break; }
482 double nLL = getLikelihood(); if (m->control_pressed) { break; }
483 checkCentroids(); if (m->control_pressed) { break; }
485 for(int i=1;i<ncpus;i++){
486 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
487 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
488 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
491 calcNewDistancesParent(0, numSeqsOnCPU);
493 total = singleTau.size();
495 for(int i=1;i<ncpus;i++){
497 int seqStart = i * numSeqs / ncpus;
498 int seqStop = (i + 1) * numSeqs / ncpus;
500 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
502 vector<int> childSeqIndex(childTotal, 0);
503 vector<double> childSingleTau(childTotal, 0);
504 vector<double> childDist(numSeqs * numOTUs, 0);
505 vector<int> otuIndex(childTotal, 0);
507 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
508 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
509 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
510 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
512 int oldTotal = total;
514 singleTau.resize(total, 0);
515 seqIndex.resize(total, 0);
516 seqNumber.resize(total, 0);
520 for(int j=oldTotal;j<total;j++){
521 int otuI = otuIndex[childIndex];
522 int seqI = childSeqIndex[childIndex];
524 singleTau[j] = childSingleTau[childIndex];
526 aaP[otuI][nSeqsPerOTU[otuI]] = j;
527 aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
532 int index = seqStart * numOTUs;
533 for(int j=seqStart;j<seqStop;j++){
534 for(int k=0;k<numOTUs;k++){
535 dist[index] = childDist[index];
543 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
545 if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
547 for(int i=1;i<ncpus;i++){
548 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
553 for(int i=1;i<ncpus;i++){
554 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
560 if (m->control_pressed) { break; }
562 m->mothurOut("\nFinalizing...\n");
565 if (m->control_pressed) { break; }
569 vector<int> otuCounts(numOTUs, 0);
570 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
571 calcCentroidsDriver(0, numOTUs);
573 if (m->control_pressed) { break; }
575 writeQualities(otuCounts); if (m->control_pressed) { break; }
576 writeSequences(otuCounts); if (m->control_pressed) { break; }
577 writeNames(otuCounts); if (m->control_pressed) { break; }
578 writeClusters(otuCounts); if (m->control_pressed) { break; }
579 writeGroups(); if (m->control_pressed) { break; }
582 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
588 MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
589 if(abort){ return 0; }
592 MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
594 for(int i=0;i<numFiles;i++){
596 if (m->control_pressed) { break; }
598 //Now into the pyrodist part
602 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
603 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
604 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
605 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
607 flowDataIntI.resize(numSeqs * numFlowCells);
608 flowDataPrI.resize(numSeqs * numFlowCells);
609 mapUniqueToSeq.resize(numSeqs);
610 mapSeqToUnique.resize(numSeqs);
611 lengths.resize(numSeqs);
612 jointLookUp.resize(NUMBINS * NUMBINS);
614 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
615 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
616 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
617 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
618 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
619 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
620 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
622 flowFileName = string(fileName);
623 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
624 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
626 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
628 if (m->control_pressed) { break; }
631 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
633 //Now into the pyronoise part
634 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
636 singleLookUp.resize(HOMOPS * NUMBINS);
637 uniqueFlowgrams.resize(numUniques * numFlowCells);
638 weight.resize(numOTUs);
639 centroids.resize(numOTUs);
640 change.resize(numOTUs);
641 dist.assign(numOTUs * numSeqs, 0);
642 nSeqsPerOTU.resize(numOTUs);
643 cumNumSeqs.resize(numOTUs);
645 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
646 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
647 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
649 int startOTU = pid * numOTUs / ncpus;
650 int endOTU = (pid + 1) * numOTUs / ncpus;
652 int startSeq = pid * numSeqs / ncpus;
653 int endSeq = (pid + 1) * numSeqs /ncpus;
659 if (m->control_pressed) { break; }
661 MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
662 singleTau.assign(total, 0.0000);
663 seqNumber.assign(total, 0);
664 seqIndex.assign(total, 0);
666 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
667 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
668 MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
669 MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
670 MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
671 MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
672 MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
674 calcCentroidsDriver(startOTU, endOTU);
676 MPI_Send(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
677 MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
679 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
680 MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
681 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
683 vector<int> otuIndex(total, 0);
684 calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
685 total = otuIndex.size();
687 MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
688 MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
689 MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
690 MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
691 MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
693 MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
700 MPI_Barrier(MPI_COMM_WORLD);
703 if(compositeFASTAFileName != ""){
704 outputNames.push_back(compositeFASTAFileName);
705 outputNames.push_back(compositeNamesFileName);
708 m->mothurOutEndLine();
709 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
710 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
711 m->mothurOutEndLine();
716 catch(exception& e) {
717 m->errorOut(e, "ShhherCommand", "execute");
721 /**************************************************************************************************/
723 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
725 ostringstream outStream;
726 outStream.setf(ios::fixed, ios::floatfield);
727 outStream.setf(ios::dec, ios::basefield);
728 outStream.setf(ios::showpoint);
729 outStream.precision(6);
731 int begTime = time(NULL);
732 double begClock = clock();
734 for(int i=startSeq;i<stopSeq;i++){
736 if (m->control_pressed) { break; }
738 for(int j=0;j<i;j++){
739 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
741 if(flowDistance < 1e-6){
742 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
744 else if(flowDistance <= cutoff){
745 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
749 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
753 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
754 if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
756 if (m->control_pressed) { return fDistFileName; }
758 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
760 ofstream distFile(fDistFileName.c_str());
761 distFile << outStream.str();
764 return fDistFileName;
766 catch(exception& e) {
767 m->errorOut(e, "ShhherCommand", "flowDistMPI");
771 /**************************************************************************************************/
773 void ShhherCommand::getOTUData(string listFileName){
777 m->openInputFile(listFileName, listFile);
780 listFile >> label >> numOTUs;
782 otuData.assign(numSeqs, 0);
783 cumNumSeqs.assign(numOTUs, 0);
784 nSeqsPerOTU.assign(numOTUs, 0);
785 aaP.clear();aaP.resize(numOTUs);
791 string singleOTU = "";
793 for(int i=0;i<numOTUs;i++){
795 if (m->control_pressed) { break; }
797 listFile >> singleOTU;
799 istringstream otuString(singleOTU);
805 for(int j=0;j<singleOTU.length();j++){
806 char letter = otuString.get();
812 map<string,int>::iterator nmIt = nameMap.find(seqName);
813 int index = nmIt->second;
819 aaP[i].push_back(index);
824 map<string,int>::iterator nmIt = nameMap.find(seqName);
826 int index = nmIt->second;
831 aaP[i].push_back(index);
836 sort(aaP[i].begin(), aaP[i].end());
837 for(int j=0;j<nSeqsPerOTU[i];j++){
838 seqNumber.push_back(aaP[i][j]);
840 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
847 for(int i=1;i<numOTUs;i++){
848 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
851 seqIndex = seqNumber;
856 catch(exception& e) {
857 m->errorOut(e, "ShhherCommand", "getOTUData");
862 /**************************************************************************************************/
864 void ShhherCommand::initPyroCluster(){
866 if (numOTUs < processors) { processors = 1; }
868 dist.assign(numSeqs * numOTUs, 0);
869 change.assign(numOTUs, 1);
870 centroids.assign(numOTUs, -1);
871 weight.assign(numOTUs, 0);
872 singleTau.assign(numSeqs, 1.0);
874 nSeqsBreaks.assign(processors+1, 0);
875 nOTUsBreaks.assign(processors+1, 0);
878 for(int i=0;i<processors;i++){
879 nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
880 nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
882 nSeqsBreaks[processors] = numSeqs;
883 nOTUsBreaks[processors] = numOTUs;
885 catch(exception& e) {
886 m->errorOut(e, "ShhherCommand", "initPyroCluster");
891 /**************************************************************************************************/
893 void ShhherCommand::fill(){
896 for(int i=0;i<numOTUs;i++){
898 if (m->control_pressed) { break; }
900 cumNumSeqs[i] = index;
901 for(int j=0;j<nSeqsPerOTU[i];j++){
902 seqNumber[index] = aaP[i][j];
903 seqIndex[index] = aaI[i][j];
909 catch(exception& e) {
910 m->errorOut(e, "ShhherCommand", "fill");
915 /**************************************************************************************************/
917 void ShhherCommand::getFlowData(){
920 m->openInputFile(flowFileName, flowFile);
923 seqNameVector.clear();
925 flowDataIntI.clear();
929 int currentNumFlowCells;
933 flowFile >> numFlowCells;
934 int index = 0;//pcluster
935 while(!flowFile.eof()){
937 if (m->control_pressed) { break; }
939 flowFile >> seqName >> currentNumFlowCells;
940 lengths.push_back(currentNumFlowCells);
942 seqNameVector.push_back(seqName);
943 nameMap[seqName] = index++;//pcluster
945 for(int i=0;i<numFlowCells;i++){
946 flowFile >> intensity;
947 if(intensity > 9.99) { intensity = 9.99; }
948 int intI = int(100 * intensity + 0.0001);
949 flowDataIntI.push_back(intI);
955 numSeqs = seqNameVector.size();
957 for(int i=0;i<numSeqs;i++){
959 if (m->control_pressed) { break; }
961 int iNumFlowCells = i * numFlowCells;
962 for(int j=lengths[i];j<numFlowCells;j++){
963 flowDataIntI[iNumFlowCells + j] = 0;
968 catch(exception& e) {
969 m->errorOut(e, "ShhherCommand", "getFlowData");
973 /**************************************************************************************************/
974 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
977 vector<double> newTau(numOTUs,0);
978 vector<double> norms(numSeqs, 0);
983 for(int i=startSeq;i<stopSeq;i++){
985 if (m->control_pressed) { break; }
988 int indexOffset = i * numOTUs;
990 for(int j=0;j<numOTUs;j++){
992 if(weight[j] > MIN_WEIGHT && change[j] == 1){
993 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
995 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
996 offset = dist[indexOffset + j];
1000 for(int j=0;j<numOTUs;j++){
1001 if(weight[j] > MIN_WEIGHT){
1002 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1003 norms[i] += newTau[j];
1010 for(int j=0;j<numOTUs;j++){
1012 newTau[j] /= norms[i];
1014 if(newTau[j] > MIN_TAU){
1015 otuIndex.push_back(j);
1016 seqIndex.push_back(i);
1017 singleTau.push_back(newTau[j]);
1023 catch(exception& e) {
1024 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1029 /**************************************************************************************************/
1031 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1036 vector<double> newTau(numOTUs,0);
1037 vector<double> norms(numSeqs, 0);
1038 nSeqsPerOTU.assign(numOTUs, 0);
1040 for(int i=startSeq;i<stopSeq;i++){
1042 if (m->control_pressed) { break; }
1044 int indexOffset = i * numOTUs;
1046 double offset = 1e8;
1048 for(int j=0;j<numOTUs;j++){
1050 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1051 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1054 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1055 offset = dist[indexOffset + j];
1059 for(int j=0;j<numOTUs;j++){
1060 if(weight[j] > MIN_WEIGHT){
1061 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1062 norms[i] += newTau[j];
1069 for(int j=0;j<numOTUs;j++){
1070 newTau[j] /= norms[i];
1073 for(int j=0;j<numOTUs;j++){
1074 if(newTau[j] > MIN_TAU){
1076 int oldTotal = total;
1080 singleTau.resize(total, 0);
1081 seqNumber.resize(total, 0);
1082 seqIndex.resize(total, 0);
1084 singleTau[oldTotal] = newTau[j];
1086 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1087 aaI[j][nSeqsPerOTU[j]] = i;
1095 catch(exception& e) {
1096 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1101 /**************************************************************************************************/
1103 void ShhherCommand::setOTUs(){
1106 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1108 for(int i=0;i<numOTUs;i++){
1110 if (m->control_pressed) { break; }
1112 for(int j=0;j<nSeqsPerOTU[i];j++){
1113 int index = cumNumSeqs[i] + j;
1114 double tauValue = singleTau[seqNumber[index]];
1115 int sIndex = seqIndex[index];
1116 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1120 for(int i=0;i<numSeqs;i++){
1121 double maxTau = -1.0000;
1123 for(int j=0;j<numOTUs;j++){
1124 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1125 maxTau = bigTauMatrix[i * numOTUs + j];
1130 otuData[i] = maxOTU;
1133 nSeqsPerOTU.assign(numOTUs, 0);
1135 for(int i=0;i<numSeqs;i++){
1136 int index = otuData[i];
1138 singleTau[i] = 1.0000;
1141 aaP[index][nSeqsPerOTU[index]] = i;
1142 aaI[index][nSeqsPerOTU[index]] = i;
1144 nSeqsPerOTU[index]++;
1148 catch(exception& e) {
1149 m->errorOut(e, "ShhherCommand", "setOTUs");
1154 /**************************************************************************************************/
1156 void ShhherCommand::getUniques(){
1161 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1162 uniqueCount.assign(numSeqs, 0); // anWeights
1163 uniqueLengths.assign(numSeqs, 0);
1164 mapSeqToUnique.assign(numSeqs, -1);
1165 mapUniqueToSeq.assign(numSeqs, -1);
1167 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1169 for(int i=0;i<numSeqs;i++){
1171 if (m->control_pressed) { break; }
1175 vector<short> current(numFlowCells);
1176 for(int j=0;j<numFlowCells;j++){
1177 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1180 for(int j=0;j<numUniques;j++){
1181 int offset = j * numFlowCells;
1185 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
1186 else { shorterLength = uniqueLengths[j]; }
1188 for(int k=0;k<shorterLength;k++){
1189 if(current[k] != uniqueFlowgrams[offset + k]){
1196 mapSeqToUnique[i] = j;
1199 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
1205 if(index == numUniques){
1206 uniqueLengths[numUniques] = lengths[i];
1207 uniqueCount[numUniques] = 1;
1208 mapSeqToUnique[i] = numUniques;//anMap
1209 mapUniqueToSeq[numUniques] = i;//anF
1211 for(int k=0;k<numFlowCells;k++){
1212 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1213 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1219 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1220 uniqueLengths.resize(numUniques);
1222 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1223 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
1225 catch(exception& e) {
1226 m->errorOut(e, "ShhherCommand", "getUniques");
1231 /**************************************************************************************************/
1233 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1235 int minLength = lengths[mapSeqToUnique[seqA]];
1236 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
1238 int ANumFlowCells = seqA * numFlowCells;
1239 int BNumFlowCells = seqB * numFlowCells;
1243 for(int i=0;i<minLength;i++){
1245 if (m->control_pressed) { break; }
1247 int flowAIntI = flowDataIntI[ANumFlowCells + i];
1248 float flowAPrI = flowDataPrI[ANumFlowCells + i];
1250 int flowBIntI = flowDataIntI[BNumFlowCells + i];
1251 float flowBPrI = flowDataPrI[BNumFlowCells + i];
1252 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1255 dist /= (float) minLength;
1258 catch(exception& e) {
1259 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1264 //**********************************************************************************************************************/
1266 string ShhherCommand::cluster(string distFileName, string namesFileName){
1269 ReadMatrix* read = new ReadColumnMatrix(distFileName);
1270 read->setCutoff(cutoff);
1272 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1273 clusterNameMap->readMap();
1274 read->read(clusterNameMap);
1276 ListVector* list = read->getListVector();
1277 SparseMatrix* matrix = read->getMatrix();
1280 delete clusterNameMap;
1282 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1284 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
1285 string tag = cluster->getTag();
1287 double clusterCutoff = cutoff;
1288 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1290 if (m->control_pressed) { break; }
1292 cluster->update(clusterCutoff);
1295 list->setLabel(toString(cutoff));
1297 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1299 m->openOutputFile(listFileName, listFile);
1300 list->print(listFile);
1303 delete matrix; delete cluster; delete rabund; delete list;
1305 return listFileName;
1307 catch(exception& e) {
1308 m->errorOut(e, "ShhherCommand", "cluster");
1313 /**************************************************************************************************/
1315 void ShhherCommand::calcCentroidsDriver(int start, int finish){
1317 //this function gets the most likely homopolymer length at a flow position for a group of sequences
1322 for(int i=start;i<finish;i++){
1324 if (m->control_pressed) { break; }
1328 int minFlowGram = 100000000;
1329 double minFlowValue = 1e8;
1330 change[i] = 0; //FALSE
1332 for(int j=0;j<nSeqsPerOTU[i];j++){
1333 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1336 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1337 vector<double> adF(nSeqsPerOTU[i]);
1338 vector<int> anL(nSeqsPerOTU[i]);
1340 for(int j=0;j<nSeqsPerOTU[i];j++){
1341 int index = cumNumSeqs[i] + j;
1342 int nI = seqIndex[index];
1343 int nIU = mapSeqToUnique[nI];
1346 for(k=0;k<position;k++){
1352 anL[position] = nIU;
1353 adF[position] = 0.0000;
1358 for(int j=0;j<nSeqsPerOTU[i];j++){
1359 int index = cumNumSeqs[i] + j;
1360 int nI = seqIndex[index];
1362 double tauValue = singleTau[seqNumber[index]];
1364 for(int k=0;k<position;k++){
1365 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1366 adF[k] += dist * tauValue;
1370 for(int j=0;j<position;j++){
1371 if(adF[j] < minFlowValue){
1373 minFlowValue = adF[j];
1377 if(centroids[i] != anL[minFlowGram]){
1379 centroids[i] = anL[minFlowGram];
1382 else if(centroids[i] != -1){
1388 catch(exception& e) {
1389 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1394 /**************************************************************************************************/
1396 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1399 int flowAValue = cent * numFlowCells;
1400 int flowBValue = flow * numFlowCells;
1404 for(int i=0;i<length;i++){
1405 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1410 return dist / (double)length;
1412 catch(exception& e) {
1413 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1418 /**************************************************************************************************/
1420 double ShhherCommand::getNewWeights(){
1423 double maxChange = 0;
1425 for(int i=0;i<numOTUs;i++){
1427 if (m->control_pressed) { break; }
1429 double difference = weight[i];
1432 for(int j=0;j<nSeqsPerOTU[i];j++){
1433 int index = cumNumSeqs[i] + j;
1434 double tauValue = singleTau[seqNumber[index]];
1435 weight[i] += tauValue;
1438 difference = fabs(weight[i] - difference);
1439 if(difference > maxChange){ maxChange = difference; }
1443 catch(exception& e) {
1444 m->errorOut(e, "ShhherCommand", "getNewWeights");
1449 /**************************************************************************************************/
1451 double ShhherCommand::getLikelihood(){
1455 vector<long double> P(numSeqs, 0);
1458 for(int i=0;i<numOTUs;i++){
1459 if(weight[i] > MIN_WEIGHT){
1465 for(int i=0;i<numOTUs;i++){
1467 if (m->control_pressed) { break; }
1469 for(int j=0;j<nSeqsPerOTU[i];j++){
1470 int index = cumNumSeqs[i] + j;
1471 int nI = seqIndex[index];
1472 double singleDist = dist[seqNumber[index]];
1474 P[nI] += weight[i] * exp(-singleDist * sigma);
1478 for(int i=0;i<numSeqs;i++){
1479 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1484 nLL = nLL -(double)numSeqs * log(sigma);
1488 catch(exception& e) {
1489 m->errorOut(e, "ShhherCommand", "getNewWeights");
1494 /**************************************************************************************************/
1496 void ShhherCommand::checkCentroids(){
1498 vector<int> unique(numOTUs, 1);
1500 for(int i=0;i<numOTUs;i++){
1501 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1506 for(int i=0;i<numOTUs;i++){
1508 if (m->control_pressed) { break; }
1511 for(int j=i+1;j<numOTUs;j++){
1514 if(centroids[j] == centroids[i]){
1518 weight[i] += weight[j];
1526 catch(exception& e) {
1527 m->errorOut(e, "ShhherCommand", "checkCentroids");
1531 /**************************************************************************************************/
1535 void ShhherCommand::writeQualities(vector<int> otuCounts){
1538 string thisOutputDir = outputDir;
1539 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1540 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1542 ofstream qualityFile;
1543 m->openOutputFile(qualityFileName, qualityFile);
1545 qualityFile.setf(ios::fixed, ios::floatfield);
1546 qualityFile.setf(ios::showpoint);
1547 qualityFile << setprecision(6);
1549 vector<vector<int> > qualities(numOTUs);
1550 vector<double> pr(HOMOPS, 0);
1553 for(int i=0;i<numOTUs;i++){
1555 if (m->control_pressed) { break; }
1560 if(nSeqsPerOTU[i] > 0){
1561 qualities[i].assign(1024, -1);
1563 while(index < numFlowCells){
1564 double maxPrValue = 1e8;
1565 short maxPrIndex = -1;
1566 double count = 0.0000;
1568 pr.assign(HOMOPS, 0);
1570 for(int j=0;j<nSeqsPerOTU[i];j++){
1571 int lIndex = cumNumSeqs[i] + j;
1572 double tauValue = singleTau[seqNumber[lIndex]];
1573 int sequenceIndex = aaI[i][j];
1574 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1578 for(int s=0;s<HOMOPS;s++){
1579 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1583 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1584 maxPrValue = pr[maxPrIndex];
1586 if(count > MIN_COUNT){
1588 double norm = 0.0000;
1590 for(int s=0;s<HOMOPS;s++){
1591 norm += exp(-(pr[s] - maxPrValue));
1594 for(int s=1;s<=maxPrIndex;s++){
1596 double temp = 0.0000;
1598 U += exp(-(pr[s-1]-maxPrValue))/norm;
1606 temp = floor(-10 * temp);
1607 value = (int)floor(temp);
1608 if(value > 100){ value = 100; }
1610 qualities[i][base] = (int)value;
1620 if(otuCounts[i] > 0){
1621 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1623 int j=4; //need to get past the first four bases
1624 while(qualities[i][j] != -1){
1625 qualityFile << qualities[i][j] << ' ';
1628 qualityFile << endl;
1631 qualityFile.close();
1632 outputNames.push_back(qualityFileName);
1635 catch(exception& e) {
1636 m->errorOut(e, "ShhherCommand", "writeQualities");
1641 /**************************************************************************************************/
1643 void ShhherCommand::writeSequences(vector<int> otuCounts){
1645 string thisOutputDir = outputDir;
1646 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1647 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1649 m->openOutputFile(fastaFileName, fastaFile);
1651 vector<string> names(numOTUs, "");
1653 for(int i=0;i<numOTUs;i++){
1655 if (m->control_pressed) { break; }
1657 int index = centroids[i];
1659 if(otuCounts[i] > 0){
1660 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1664 for(int j=0;j<numFlowCells;j++){
1666 char base = flowOrder[j % 4];
1667 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1672 fastaFile << newSeq.substr(4) << endl;
1677 outputNames.push_back(fastaFileName);
1679 if(compositeFASTAFileName != ""){
1680 m->appendFiles(fastaFileName, compositeFASTAFileName);
1683 catch(exception& e) {
1684 m->errorOut(e, "ShhherCommand", "writeSequences");
1689 /**************************************************************************************************/
1691 void ShhherCommand::writeNames(vector<int> otuCounts){
1693 string thisOutputDir = outputDir;
1694 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1695 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1697 m->openOutputFile(nameFileName, nameFile);
1699 for(int i=0;i<numOTUs;i++){
1701 if (m->control_pressed) { break; }
1703 if(otuCounts[i] > 0){
1704 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1706 for(int j=1;j<nSeqsPerOTU[i];j++){
1707 nameFile << ',' << seqNameVector[aaI[i][j]];
1714 outputNames.push_back(nameFileName);
1717 if(compositeNamesFileName != ""){
1718 m->appendFiles(nameFileName, compositeNamesFileName);
1721 catch(exception& e) {
1722 m->errorOut(e, "ShhherCommand", "writeNames");
1727 /**************************************************************************************************/
1729 void ShhherCommand::writeGroups(){
1731 string thisOutputDir = outputDir;
1732 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1733 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1734 string groupFileName = fileRoot + "shhh.groups";
1736 m->openOutputFile(groupFileName, groupFile);
1738 for(int i=0;i<numSeqs;i++){
1739 if (m->control_pressed) { break; }
1740 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1743 outputNames.push_back(groupFileName);
1746 catch(exception& e) {
1747 m->errorOut(e, "ShhherCommand", "writeGroups");
1752 /**************************************************************************************************/
1754 void ShhherCommand::writeClusters(vector<int> otuCounts){
1756 string thisOutputDir = outputDir;
1757 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1758 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1759 ofstream otuCountsFile;
1760 m->openOutputFile(otuCountsFileName, otuCountsFile);
1762 string bases = flowOrder;
1764 for(int i=0;i<numOTUs;i++){
1766 if (m->control_pressed) {
1769 //output the translated version of the centroid sequence for the otu
1770 if(otuCounts[i] > 0){
1771 int index = centroids[i];
1773 otuCountsFile << "ideal\t";
1774 for(int j=8;j<numFlowCells;j++){
1775 char base = bases[j % 4];
1776 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1777 otuCountsFile << base;
1780 otuCountsFile << endl;
1782 for(int j=0;j<nSeqsPerOTU[i];j++){
1783 int sequence = aaI[i][j];
1784 otuCountsFile << seqNameVector[sequence] << '\t';
1788 for(int k=0;k<lengths[sequence];k++){
1789 char base = bases[k % 4];
1790 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1792 for(int s=0;s<freq;s++){
1794 //otuCountsFile << base;
1797 otuCountsFile << newSeq.substr(4) << endl;
1799 otuCountsFile << endl;
1802 otuCountsFile.close();
1803 outputNames.push_back(otuCountsFileName);
1806 catch(exception& e) {
1807 m->errorOut(e, "ShhherCommand", "writeClusters");
1813 //**********************************************************************************************************************
1815 int ShhherCommand::execute(){
1817 if (abort == true) { return 0; }
1819 getSingleLookUp(); if (m->control_pressed) { return 0; }
1820 getJointLookUp(); if (m->control_pressed) { return 0; }
1822 int numFiles = flowFileVector.size();
1824 if (numFiles < processors) { processors = numFiles; }
1826 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1827 if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1828 else { createProcesses(flowFileVector); } //each processor processes one file
1830 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1833 if(compositeFASTAFileName != ""){
1834 outputNames.push_back(compositeFASTAFileName);
1835 outputNames.push_back(compositeNamesFileName);
1838 m->mothurOutEndLine();
1839 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1840 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
1841 m->mothurOutEndLine();
1845 catch(exception& e) {
1846 m->errorOut(e, "ShhherCommand", "execute");
1851 /**************************************************************************************************/
1853 int ShhherCommand::createProcesses(vector<string> filenames){
1855 vector<int> processIDS;
1860 if (filenames.size() < processors) { processors = filenames.size(); }
1862 //divide the groups between the processors
1863 vector<linePair> lines;
1864 int numFilesPerProcessor = filenames.size() / processors;
1865 for (int i = 0; i < processors; i++) {
1866 int startIndex = i * numFilesPerProcessor;
1867 int endIndex = (i+1) * numFilesPerProcessor;
1868 if(i == (processors - 1)){ endIndex = filenames.size(); }
1869 lines.push_back(linePair(startIndex, endIndex));
1872 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1874 //loop through and create all the processes you want
1875 while (process != processors) {
1879 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1881 }else if (pid == 0){
1882 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1885 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1886 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1892 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1894 //force parent to wait until all the processes are done
1895 for (int i=0;i<processIDS.size();i++) {
1896 int temp = processIDS[i];
1902 //////////////////////////////////////////////////////////////////////////////////////////////////////
1904 /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1906 //////////////////////////////////////////////////////////////////////////////////////////////////////
1907 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
1908 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1909 //////////////////////////////////////////////////////////////////////////////////////////////////////
1911 vector<shhhFlowsData*> pDataArray;
1912 DWORD dwThreadIdArray[processors-1];
1913 HANDLE hThreadArray[processors-1];
1915 //Create processor worker threads.
1916 for( int i=0; i<processors-1; i++ ){
1917 // Allocate memory for thread data.
1918 string extension = "";
1919 if (i != 0) { extension = toString(i) + ".temp"; }
1921 shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1922 pDataArray.push_back(tempFlow);
1923 processIDS.push_back(i);
1925 hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1928 //using the main process as a worker saves time and memory
1930 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1932 //Wait until all threads have terminated.
1933 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1935 //Close all thread handles and free memory allocations.
1936 for(int i=0; i < pDataArray.size(); i++){
1937 for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1938 CloseHandle(hThreadArray[i]);
1939 delete pDataArray[i];
1944 for (int i=0;i<processIDS.size();i++) {
1945 if (compositeFASTAFileName != "") {
1946 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
1947 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
1948 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
1949 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
1956 catch(exception& e) {
1957 m->errorOut(e, "ShhherCommand", "createProcesses");
1961 /**************************************************************************************************/
1963 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
1966 for(int i=start;i<end;i++){
1968 if (m->control_pressed) { break; }
1970 string flowFileName = filenames[i];
1972 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
1973 m->mothurOut("Reading flowgrams...\n");
1975 vector<string> seqNameVector;
1976 vector<int> lengths;
1977 vector<short> flowDataIntI;
1978 vector<double> flowDataPrI;
1979 map<string, int> nameMap;
1980 vector<short> uniqueFlowgrams;
1981 vector<int> uniqueCount;
1982 vector<int> mapSeqToUnique;
1983 vector<int> mapUniqueToSeq;
1984 vector<int> uniqueLengths;
1987 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
1989 if (m->control_pressed) { break; }
1991 m->mothurOut("Identifying unique flowgrams...\n");
1992 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
1994 if (m->control_pressed) { break; }
1996 m->mothurOut("Calculating distances between flowgrams...\n");
1997 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
1998 unsigned long long begTime = time(NULL);
1999 double begClock = clock();
2001 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2003 m->mothurOutEndLine();
2004 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2007 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2008 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2010 if (m->control_pressed) { break; }
2012 m->mothurOut("\nClustering flowgrams...\n");
2013 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2014 cluster(listFileName, distFileName, namesFileName);
2016 if (m->control_pressed) { break; }
2018 vector<int> otuData;
2019 vector<int> cumNumSeqs;
2020 vector<int> nSeqsPerOTU;
2021 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2022 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2023 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
2024 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
2027 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2029 if (m->control_pressed) { break; }
2031 m->mothurRemove(distFileName);
2032 m->mothurRemove(namesFileName);
2033 m->mothurRemove(listFileName);
2035 vector<double> dist; //adDist - distance of sequences to centroids
2036 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
2037 vector<int> centroids; //the representative flowgram for each cluster m
2038 vector<double> weight;
2039 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2040 vector<int> nSeqsBreaks;
2041 vector<int> nOTUsBreaks;
2043 dist.assign(numSeqs * numOTUs, 0);
2044 change.assign(numOTUs, 1);
2045 centroids.assign(numOTUs, -1);
2046 weight.assign(numOTUs, 0);
2047 singleTau.assign(numSeqs, 1.0);
2049 nSeqsBreaks.assign(2, 0);
2050 nOTUsBreaks.assign(2, 0);
2053 nSeqsBreaks[1] = numSeqs;
2054 nOTUsBreaks[1] = numOTUs;
2056 if (m->control_pressed) { break; }
2058 double maxDelta = 0;
2062 begTime = time(NULL);
2064 m->mothurOut("\nDenoising flowgrams...\n");
2065 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2067 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2069 if (m->control_pressed) { break; }
2071 double cycClock = clock();
2072 unsigned long long cycTime = time(NULL);
2073 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2075 if (m->control_pressed) { break; }
2077 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2079 if (m->control_pressed) { break; }
2081 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
2083 if (m->control_pressed) { break; }
2085 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
2087 if (m->control_pressed) { break; }
2089 checkCentroids(numOTUs, centroids, weight);
2091 if (m->control_pressed) { break; }
2093 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2095 if (m->control_pressed) { break; }
2099 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2103 if (m->control_pressed) { break; }
2105 m->mothurOut("\nFinalizing...\n");
2106 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2108 if (m->control_pressed) { break; }
2110 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2112 if (m->control_pressed) { break; }
2114 vector<int> otuCounts(numOTUs, 0);
2115 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
2117 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2119 if (m->control_pressed) { break; }
2121 writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2122 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2123 writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
2124 writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
2125 writeGroups(flowFileName, numSeqs, seqNameVector); if (m->control_pressed) { break; }
2127 m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2130 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2134 }catch(exception& e) {
2135 m->errorOut(e, "ShhherCommand", "driver");
2140 /**************************************************************************************************/
2141 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2146 m->openInputFile(filename, flowFile);
2149 int currentNumFlowCells;
2151 thisSeqNameVector.clear();
2152 thisLengths.clear();
2153 thisFlowDataIntI.clear();
2154 thisNameMap.clear();
2156 flowFile >> numFlowCells;
2157 int index = 0;//pcluster
2158 while(!flowFile.eof()){
2160 if (m->control_pressed) { break; }
2162 flowFile >> seqName >> currentNumFlowCells;
2163 thisLengths.push_back(currentNumFlowCells);
2165 thisSeqNameVector.push_back(seqName);
2166 thisNameMap[seqName] = index++;//pcluster
2168 for(int i=0;i<numFlowCells;i++){
2169 flowFile >> intensity;
2170 if(intensity > 9.99) { intensity = 9.99; }
2171 int intI = int(100 * intensity + 0.0001);
2172 thisFlowDataIntI.push_back(intI);
2174 m->gobble(flowFile);
2178 int numSeqs = thisSeqNameVector.size();
2180 for(int i=0;i<numSeqs;i++){
2182 if (m->control_pressed) { break; }
2184 int iNumFlowCells = i * numFlowCells;
2185 for(int j=thisLengths[i];j<numFlowCells;j++){
2186 thisFlowDataIntI[iNumFlowCells + j] = 0;
2193 catch(exception& e) {
2194 m->errorOut(e, "ShhherCommand", "getFlowData");
2198 /**************************************************************************************************/
2200 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2203 ostringstream outStream;
2204 outStream.setf(ios::fixed, ios::floatfield);
2205 outStream.setf(ios::dec, ios::basefield);
2206 outStream.setf(ios::showpoint);
2207 outStream.precision(6);
2209 int begTime = time(NULL);
2210 double begClock = clock();
2212 for(int i=0;i<stopSeq;i++){
2214 if (m->control_pressed) { break; }
2216 for(int j=0;j<i;j++){
2217 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2219 if(flowDistance < 1e-6){
2220 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2222 else if(flowDistance <= cutoff){
2223 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2227 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2228 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2229 m->mothurOutEndLine();
2233 ofstream distFile(distFileName.c_str());
2234 distFile << outStream.str();
2237 if (m->control_pressed) {}
2239 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2240 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2241 m->mothurOutEndLine();
2246 catch(exception& e) {
2247 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2251 /**************************************************************************************************/
2253 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2255 int minLength = lengths[mapSeqToUnique[seqA]];
2256 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
2258 int ANumFlowCells = seqA * numFlowCells;
2259 int BNumFlowCells = seqB * numFlowCells;
2263 for(int i=0;i<minLength;i++){
2265 if (m->control_pressed) { break; }
2267 int flowAIntI = flowDataIntI[ANumFlowCells + i];
2268 float flowAPrI = flowDataPrI[ANumFlowCells + i];
2270 int flowBIntI = flowDataIntI[BNumFlowCells + i];
2271 float flowBPrI = flowDataPrI[BNumFlowCells + i];
2272 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2275 dist /= (float) minLength;
2278 catch(exception& e) {
2279 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2284 /**************************************************************************************************/
2286 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2289 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2290 uniqueCount.assign(numSeqs, 0); // anWeights
2291 uniqueLengths.assign(numSeqs, 0);
2292 mapSeqToUnique.assign(numSeqs, -1);
2293 mapUniqueToSeq.assign(numSeqs, -1);
2295 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2297 for(int i=0;i<numSeqs;i++){
2299 if (m->control_pressed) { break; }
2303 vector<short> current(numFlowCells);
2304 for(int j=0;j<numFlowCells;j++){
2305 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2308 for(int j=0;j<numUniques;j++){
2309 int offset = j * numFlowCells;
2313 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
2314 else { shorterLength = uniqueLengths[j]; }
2316 for(int k=0;k<shorterLength;k++){
2317 if(current[k] != uniqueFlowgrams[offset + k]){
2324 mapSeqToUnique[i] = j;
2327 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
2333 if(index == numUniques){
2334 uniqueLengths[numUniques] = lengths[i];
2335 uniqueCount[numUniques] = 1;
2336 mapSeqToUnique[i] = numUniques;//anMap
2337 mapUniqueToSeq[numUniques] = i;//anF
2339 for(int k=0;k<numFlowCells;k++){
2340 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2341 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2347 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2348 uniqueLengths.resize(numUniques);
2350 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2351 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
2355 catch(exception& e) {
2356 m->errorOut(e, "ShhherCommand", "getUniques");
2360 /**************************************************************************************************/
2361 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2364 vector<string> duplicateNames(numUniques, "");
2365 for(int i=0;i<numSeqs;i++){
2366 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2370 m->openOutputFile(filename, nameFile);
2372 for(int i=0;i<numUniques;i++){
2374 if (m->control_pressed) { break; }
2376 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2377 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2384 catch(exception& e) {
2385 m->errorOut(e, "ShhherCommand", "createNamesFile");
2389 //**********************************************************************************************************************
2391 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2394 ReadMatrix* read = new ReadColumnMatrix(distFileName);
2395 read->setCutoff(cutoff);
2397 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2398 clusterNameMap->readMap();
2399 read->read(clusterNameMap);
2401 ListVector* list = read->getListVector();
2402 SparseMatrix* matrix = read->getMatrix();
2405 delete clusterNameMap;
2407 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2409 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
2410 string tag = cluster->getTag();
2412 double clusterCutoff = cutoff;
2413 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2415 if (m->control_pressed) { break; }
2417 cluster->update(clusterCutoff);
2420 list->setLabel(toString(cutoff));
2423 m->openOutputFile(filename, listFile);
2424 list->print(listFile);
2427 delete matrix; delete cluster; delete rabund; delete list;
2431 catch(exception& e) {
2432 m->errorOut(e, "ShhherCommand", "cluster");
2436 /**************************************************************************************************/
2438 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
2439 vector<int>& cumNumSeqs,
2440 vector<int>& nSeqsPerOTU,
2441 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2442 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2443 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
2444 vector<int>& seqIndex,
2445 map<string, int>& nameMap){
2449 m->openInputFile(fileName, listFile);
2453 listFile >> label >> numOTUs;
2455 otuData.assign(numSeqs, 0);
2456 cumNumSeqs.assign(numOTUs, 0);
2457 nSeqsPerOTU.assign(numOTUs, 0);
2458 aaP.clear();aaP.resize(numOTUs);
2464 string singleOTU = "";
2466 for(int i=0;i<numOTUs;i++){
2468 if (m->control_pressed) { break; }
2470 listFile >> singleOTU;
2472 istringstream otuString(singleOTU);
2476 string seqName = "";
2478 for(int j=0;j<singleOTU.length();j++){
2479 char letter = otuString.get();
2485 map<string,int>::iterator nmIt = nameMap.find(seqName);
2486 int index = nmIt->second;
2488 nameMap.erase(nmIt);
2492 aaP[i].push_back(index);
2497 map<string,int>::iterator nmIt = nameMap.find(seqName);
2499 int index = nmIt->second;
2500 nameMap.erase(nmIt);
2504 aaP[i].push_back(index);
2509 sort(aaP[i].begin(), aaP[i].end());
2510 for(int j=0;j<nSeqsPerOTU[i];j++){
2511 seqNumber.push_back(aaP[i][j]);
2513 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2514 aaP[i].push_back(0);
2520 for(int i=1;i<numOTUs;i++){
2521 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2524 seqIndex = seqNumber;
2531 catch(exception& e) {
2532 m->errorOut(e, "ShhherCommand", "getOTUData");
2536 /**************************************************************************************************/
2538 int ShhherCommand::calcCentroidsDriver(int numOTUs,
2539 vector<int>& cumNumSeqs,
2540 vector<int>& nSeqsPerOTU,
2541 vector<int>& seqIndex,
2542 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
2543 vector<int>& centroids, //the representative flowgram for each cluster m
2544 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2545 vector<int>& mapSeqToUnique,
2546 vector<short>& uniqueFlowgrams,
2547 vector<short>& flowDataIntI,
2548 vector<int>& lengths,
2550 vector<int>& seqNumber){
2552 //this function gets the most likely homopolymer length at a flow position for a group of sequences
2557 for(int i=0;i<numOTUs;i++){
2559 if (m->control_pressed) { break; }
2563 int minFlowGram = 100000000;
2564 double minFlowValue = 1e8;
2565 change[i] = 0; //FALSE
2567 for(int j=0;j<nSeqsPerOTU[i];j++){
2568 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2571 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2572 vector<double> adF(nSeqsPerOTU[i]);
2573 vector<int> anL(nSeqsPerOTU[i]);
2575 for(int j=0;j<nSeqsPerOTU[i];j++){
2576 int index = cumNumSeqs[i] + j;
2577 int nI = seqIndex[index];
2578 int nIU = mapSeqToUnique[nI];
2581 for(k=0;k<position;k++){
2587 anL[position] = nIU;
2588 adF[position] = 0.0000;
2593 for(int j=0;j<nSeqsPerOTU[i];j++){
2594 int index = cumNumSeqs[i] + j;
2595 int nI = seqIndex[index];
2597 double tauValue = singleTau[seqNumber[index]];
2599 for(int k=0;k<position;k++){
2600 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2601 adF[k] += dist * tauValue;
2605 for(int j=0;j<position;j++){
2606 if(adF[j] < minFlowValue){
2608 minFlowValue = adF[j];
2612 if(centroids[i] != anL[minFlowGram]){
2614 centroids[i] = anL[minFlowGram];
2617 else if(centroids[i] != -1){
2625 catch(exception& e) {
2626 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2630 /**************************************************************************************************/
2632 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2633 vector<short>& flowDataIntI, int numFlowCells){
2636 int flowAValue = cent * numFlowCells;
2637 int flowBValue = flow * numFlowCells;
2641 for(int i=0;i<length;i++){
2642 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2647 return dist / (double)length;
2649 catch(exception& e) {
2650 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2654 /**************************************************************************************************/
2656 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2659 double maxChange = 0;
2661 for(int i=0;i<numOTUs;i++){
2663 if (m->control_pressed) { break; }
2665 double difference = weight[i];
2668 for(int j=0;j<nSeqsPerOTU[i];j++){
2669 int index = cumNumSeqs[i] + j;
2670 double tauValue = singleTau[seqNumber[index]];
2671 weight[i] += tauValue;
2674 difference = fabs(weight[i] - difference);
2675 if(difference > maxChange){ maxChange = difference; }
2679 catch(exception& e) {
2680 m->errorOut(e, "ShhherCommand", "getNewWeights");
2685 /**************************************************************************************************/
2687 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2691 vector<long double> P(numSeqs, 0);
2694 for(int i=0;i<numOTUs;i++){
2695 if(weight[i] > MIN_WEIGHT){
2701 for(int i=0;i<numOTUs;i++){
2703 if (m->control_pressed) { break; }
2705 for(int j=0;j<nSeqsPerOTU[i];j++){
2706 int index = cumNumSeqs[i] + j;
2707 int nI = seqIndex[index];
2708 double singleDist = dist[seqNumber[index]];
2710 P[nI] += weight[i] * exp(-singleDist * sigma);
2714 for(int i=0;i<numSeqs;i++){
2715 if(P[i] == 0){ P[i] = DBL_EPSILON; }
2720 nLL = nLL -(double)numSeqs * log(sigma);
2724 catch(exception& e) {
2725 m->errorOut(e, "ShhherCommand", "getNewWeights");
2730 /**************************************************************************************************/
2732 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2734 vector<int> unique(numOTUs, 1);
2736 for(int i=0;i<numOTUs;i++){
2737 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2742 for(int i=0;i<numOTUs;i++){
2744 if (m->control_pressed) { break; }
2747 for(int j=i+1;j<numOTUs;j++){
2750 if(centroids[j] == centroids[i]){
2754 weight[i] += weight[j];
2764 catch(exception& e) {
2765 m->errorOut(e, "ShhherCommand", "checkCentroids");
2769 /**************************************************************************************************/
2771 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
2772 vector<double>& weight, vector<short>& change, vector<int>& centroids,
2773 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
2774 vector<int>& seqNumber, vector<int>& seqIndex,
2775 vector<short>& uniqueFlowgrams,
2776 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2781 vector<double> newTau(numOTUs,0);
2782 vector<double> norms(numSeqs, 0);
2783 nSeqsPerOTU.assign(numOTUs, 0);
2785 for(int i=0;i<numSeqs;i++){
2787 if (m->control_pressed) { break; }
2789 int indexOffset = i * numOTUs;
2791 double offset = 1e8;
2793 for(int j=0;j<numOTUs;j++){
2795 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2796 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2799 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2800 offset = dist[indexOffset + j];
2804 for(int j=0;j<numOTUs;j++){
2805 if(weight[j] > MIN_WEIGHT){
2806 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2807 norms[i] += newTau[j];
2814 for(int j=0;j<numOTUs;j++){
2815 newTau[j] /= norms[i];
2818 for(int j=0;j<numOTUs;j++){
2819 if(newTau[j] > MIN_TAU){
2821 int oldTotal = total;
2825 singleTau.resize(total, 0);
2826 seqNumber.resize(total, 0);
2827 seqIndex.resize(total, 0);
2829 singleTau[oldTotal] = newTau[j];
2831 aaP[j][nSeqsPerOTU[j]] = oldTotal;
2832 aaI[j][nSeqsPerOTU[j]] = i;
2840 catch(exception& e) {
2841 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2845 /**************************************************************************************************/
2847 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2850 for(int i=0;i<numOTUs;i++){
2852 if (m->control_pressed) { return 0; }
2854 cumNumSeqs[i] = index;
2855 for(int j=0;j<nSeqsPerOTU[i];j++){
2856 seqNumber[index] = aaP[i][j];
2857 seqIndex[index] = aaI[i][j];
2865 catch(exception& e) {
2866 m->errorOut(e, "ShhherCommand", "fill");
2870 /**************************************************************************************************/
2872 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2873 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2876 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2878 for(int i=0;i<numOTUs;i++){
2880 if (m->control_pressed) { break; }
2882 for(int j=0;j<nSeqsPerOTU[i];j++){
2883 int index = cumNumSeqs[i] + j;
2884 double tauValue = singleTau[seqNumber[index]];
2885 int sIndex = seqIndex[index];
2886 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
2890 for(int i=0;i<numSeqs;i++){
2891 double maxTau = -1.0000;
2893 for(int j=0;j<numOTUs;j++){
2894 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2895 maxTau = bigTauMatrix[i * numOTUs + j];
2900 otuData[i] = maxOTU;
2903 nSeqsPerOTU.assign(numOTUs, 0);
2905 for(int i=0;i<numSeqs;i++){
2906 int index = otuData[i];
2908 singleTau[i] = 1.0000;
2911 aaP[index][nSeqsPerOTU[index]] = i;
2912 aaI[index][nSeqsPerOTU[index]] = i;
2914 nSeqsPerOTU[index]++;
2917 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2919 catch(exception& e) {
2920 m->errorOut(e, "ShhherCommand", "setOTUs");
2924 /**************************************************************************************************/
2926 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2927 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2928 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2931 string thisOutputDir = outputDir;
2932 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
2933 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2935 ofstream qualityFile;
2936 m->openOutputFile(qualityFileName, qualityFile);
2938 qualityFile.setf(ios::fixed, ios::floatfield);
2939 qualityFile.setf(ios::showpoint);
2940 qualityFile << setprecision(6);
2942 vector<vector<int> > qualities(numOTUs);
2943 vector<double> pr(HOMOPS, 0);
2946 for(int i=0;i<numOTUs;i++){
2948 if (m->control_pressed) { break; }
2953 if(nSeqsPerOTU[i] > 0){
2954 qualities[i].assign(1024, -1);
2956 while(index < numFlowCells){
2957 double maxPrValue = 1e8;
2958 short maxPrIndex = -1;
2959 double count = 0.0000;
2961 pr.assign(HOMOPS, 0);
2963 for(int j=0;j<nSeqsPerOTU[i];j++){
2964 int lIndex = cumNumSeqs[i] + j;
2965 double tauValue = singleTau[seqNumber[lIndex]];
2966 int sequenceIndex = aaI[i][j];
2967 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
2971 for(int s=0;s<HOMOPS;s++){
2972 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
2976 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
2977 maxPrValue = pr[maxPrIndex];
2979 if(count > MIN_COUNT){
2981 double norm = 0.0000;
2983 for(int s=0;s<HOMOPS;s++){
2984 norm += exp(-(pr[s] - maxPrValue));
2987 for(int s=1;s<=maxPrIndex;s++){
2989 double temp = 0.0000;
2991 U += exp(-(pr[s-1]-maxPrValue))/norm;
2999 temp = floor(-10 * temp);
3000 value = (int)floor(temp);
3001 if(value > 100){ value = 100; }
3003 qualities[i][base] = (int)value;
3013 if(otuCounts[i] > 0){
3014 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3016 int j=4; //need to get past the first four bases
3017 while(qualities[i][j] != -1){
3018 qualityFile << qualities[i][j] << ' ';
3021 qualityFile << endl;
3024 qualityFile.close();
3025 outputNames.push_back(qualityFileName);
3028 catch(exception& e) {
3029 m->errorOut(e, "ShhherCommand", "writeQualities");
3034 /**************************************************************************************************/
3036 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3038 string thisOutputDir = outputDir;
3039 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3040 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3042 m->openOutputFile(fastaFileName, fastaFile);
3044 vector<string> names(numOTUs, "");
3046 for(int i=0;i<numOTUs;i++){
3048 if (m->control_pressed) { break; }
3050 int index = centroids[i];
3052 if(otuCounts[i] > 0){
3053 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3057 for(int j=0;j<numFlowCells;j++){
3059 char base = flowOrder[j % 4];
3060 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3065 fastaFile << newSeq.substr(4) << endl;
3070 outputNames.push_back(fastaFileName);
3072 if(thisCompositeFASTAFileName != ""){
3073 m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3076 catch(exception& e) {
3077 m->errorOut(e, "ShhherCommand", "writeSequences");
3082 /**************************************************************************************************/
3084 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3086 string thisOutputDir = outputDir;
3087 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3088 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3090 m->openOutputFile(nameFileName, nameFile);
3092 for(int i=0;i<numOTUs;i++){
3094 if (m->control_pressed) { break; }
3096 if(otuCounts[i] > 0){
3097 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3099 for(int j=1;j<nSeqsPerOTU[i];j++){
3100 nameFile << ',' << seqNameVector[aaI[i][j]];
3107 outputNames.push_back(nameFileName);
3110 if(thisCompositeNamesFileName != ""){
3111 m->appendFiles(nameFileName, thisCompositeNamesFileName);
3114 catch(exception& e) {
3115 m->errorOut(e, "ShhherCommand", "writeNames");
3120 /**************************************************************************************************/
3122 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3124 string thisOutputDir = outputDir;
3125 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3126 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3127 string groupFileName = fileRoot + "shhh.groups";
3129 m->openOutputFile(groupFileName, groupFile);
3131 for(int i=0;i<numSeqs;i++){
3132 if (m->control_pressed) { break; }
3133 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3136 outputNames.push_back(groupFileName);
3139 catch(exception& e) {
3140 m->errorOut(e, "ShhherCommand", "writeGroups");
3145 /**************************************************************************************************/
3147 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3149 string thisOutputDir = outputDir;
3150 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3151 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3152 ofstream otuCountsFile;
3153 m->openOutputFile(otuCountsFileName, otuCountsFile);
3155 string bases = flowOrder;
3157 for(int i=0;i<numOTUs;i++){
3159 if (m->control_pressed) {
3162 //output the translated version of the centroid sequence for the otu
3163 if(otuCounts[i] > 0){
3164 int index = centroids[i];
3166 otuCountsFile << "ideal\t";
3167 for(int j=8;j<numFlowCells;j++){
3168 char base = bases[j % 4];
3169 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3170 otuCountsFile << base;
3173 otuCountsFile << endl;
3175 for(int j=0;j<nSeqsPerOTU[i];j++){
3176 int sequence = aaI[i][j];
3177 otuCountsFile << seqNameVector[sequence] << '\t';
3181 for(int k=0;k<lengths[sequence];k++){
3182 char base = bases[k % 4];
3183 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3185 for(int s=0;s<freq;s++){
3187 //otuCountsFile << base;
3190 otuCountsFile << newSeq.substr(4) << endl;
3192 otuCountsFile << endl;
3195 otuCountsFile.close();
3196 outputNames.push_back(otuCountsFileName);
3199 catch(exception& e) {
3200 m->errorOut(e, "ShhherCommand", "writeClusters");
3205 /**************************************************************************************************/
3207 void ShhherCommand::getSingleLookUp(){
3209 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
3210 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3213 ifstream lookUpFile;
3214 m->openInputFile(lookupFileName, lookUpFile);
3216 for(int i=0;i<HOMOPS;i++){
3218 if (m->control_pressed) { break; }
3221 lookUpFile >> logFracFreq;
3223 for(int j=0;j<NUMBINS;j++) {
3224 lookUpFile >> singleLookUp[index];
3230 catch(exception& e) {
3231 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3236 /**************************************************************************************************/
3238 void ShhherCommand::getJointLookUp(){
3241 // the most likely joint probability (-log) that two intenities have the same polymer length
3242 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3244 for(int i=0;i<NUMBINS;i++){
3246 if (m->control_pressed) { break; }
3248 for(int j=0;j<NUMBINS;j++){
3250 double minSum = 100000000;
3252 for(int k=0;k<HOMOPS;k++){
3253 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3255 if(sum < minSum) { minSum = sum; }
3257 jointLookUp[i * NUMBINS + j] = minSum;
3261 catch(exception& e) {
3262 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3267 /**************************************************************************************************/
3269 double ShhherCommand::getProbIntensity(int intIntensity){
3272 double minNegLogProb = 100000000;
3275 for(int i=0;i<HOMOPS;i++){//loop signal strength
3277 if (m->control_pressed) { break; }
3279 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3280 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
3283 return minNegLogProb;
3285 catch(exception& e) {
3286 m->errorOut(e, "ShhherCommand", "getProbIntensity");