5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "shhhercommand.h"
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "ShhherCommand", "setParameters");
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
44 m->errorOut(e, "ShhherCommand", "getHelpString");
48 //**********************************************************************************************************************
50 ShhherCommand::ShhherCommand(){
52 abort = true; calledHelp = true;
55 //initialize outputTypes
56 // vector<string> tempOutNames;
57 // outputTypes["pn.dist"] = tempOutNames;
61 m->errorOut(e, "ShhherCommand", "ShhherCommand");
66 //**********************************************************************************************************************
68 ShhherCommand::ShhherCommand(string option) {
72 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
77 abort = false; calledHelp = false;
79 //allow user to run help
80 if(option == "help") { help(); abort = true; calledHelp = true; }
81 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
84 vector<string> myArray = setParameters();
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter;
90 map<string,string>::iterator it;
92 //check to make sure all parameters are valid for command
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
97 //initialize outputTypes
98 vector<string> tempOutNames;
99 // outputTypes["pn.dist"] = tempOutNames;
100 // outputTypes["fasta"] = tempOutNames;
102 //if the user changes the input directory command factory will send this info to us in the output parameter
103 string inputDir = validParameter.validFile(parameters, "inputdir", false);
104 if (inputDir == "not found"){ inputDir = ""; }
107 it = parameters.find("flow");
108 //user has given a template file
109 if(it != parameters.end()){
110 path = m->hasPath(it->second);
111 //if the user has not given a path then, add inputdir. else leave path alone.
112 if (path == "") { parameters["flow"] = inputDir + it->second; }
115 it = parameters.find("lookup");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["lookup"] = inputDir + it->second; }
123 it = parameters.find("file");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["file"] = inputDir + it->second; }
133 //check for required parameters
134 flowFileName = validParameter.validFile(parameters, "flow", true);
135 flowFilesFileName = validParameter.validFile(parameters, "file", true);
136 if (flowFileName == "not found" && flowFilesFileName == "not found") {
137 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
138 m->mothurOutEndLine();
141 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
143 if(flowFileName != "not found"){
144 compositeFASTAFileName = "";
145 compositeNamesFileName = "";
150 //flow.files = 9 character offset
151 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
152 m->openOutputFile(compositeFASTAFileName, temp);
155 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
156 m->openOutputFile(compositeNamesFileName, temp);
160 if(flowFilesFileName != "not found"){
163 ifstream flowFilesFile;
164 m->openInputFile(flowFilesFileName, flowFilesFile);
165 while(flowFilesFile){
166 fName = m->getline(flowFilesFile);
168 //test if file is valid
170 int ableToOpen = m->openInputFile(fName, in, "noerror");
172 if (ableToOpen == 1) {
173 if (inputDir != "") { //default path is set
174 string tryPath = inputDir + fName;
175 m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
177 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
183 if (ableToOpen == 1) {
184 if (m->getDefaultPath() != "") { //default path is set
185 string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
186 m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
188 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
194 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
195 if (ableToOpen == 1) {
196 string exepath = m->argv;
197 string tempPath = exepath;
198 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
199 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
201 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
202 m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
204 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
209 if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
210 else { flowFileVector.push_back(fName); }
211 m->gobble(flowFilesFile);
213 flowFilesFile.close();
214 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
217 flowFileVector.push_back(flowFileName);
221 //if the user changes the output directory command factory will send this info to us in the output parameter
222 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
224 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it
228 //check for optional parameter and set defaults
229 // ...at some point should added some additional type checking...
231 temp = validParameter.validFile(parameters, "lookup", true);
232 if (temp == "not found") {
233 lookupFileName = "LookUp_Titanium.pat";
237 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
240 //if you can't open it, try input location
241 if (ableToOpen == 1) {
242 if (inputDir != "") { //default path is set
243 string tryPath = inputDir + lookupFileName;
244 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
246 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248 lookupFileName = tryPath;
252 //if you can't open it, try default location
253 if (ableToOpen == 1) {
254 if (m->getDefaultPath() != "") { //default path is set
255 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
256 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
258 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
260 lookupFileName = tryPath;
264 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
265 if (ableToOpen == 1) {
266 string exepath = m->argv;
267 string tempPath = exepath;
268 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
269 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
271 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
272 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
274 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
276 lookupFileName = tryPath;
279 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
281 else if(temp == "not open") {
283 lookupFileName = validParameter.validFile(parameters, "lookup", false);
285 //if you can't open it its not inputDir, try mothur excutable location
286 string exepath = m->argv;
287 string tempPath = exepath;
288 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
289 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
291 string tryPath = m->getFullPathName(exepath) + lookupFileName;
292 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
294 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
296 lookupFileName = tryPath;
298 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
299 }else { lookupFileName = temp; }
301 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
302 m->setProcessors(temp);
303 m->mothurConvert(temp, processors);
305 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
306 m->mothurConvert(temp, cutoff);
308 temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
309 m->mothurConvert(temp, minDelta);
311 temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
312 m->mothurConvert(temp, maxIters);
314 temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
315 m->mothurConvert(temp, sigma);
317 flowOrder = validParameter.validFile(parameters, "order", false);
318 if (flowOrder == "not found"){ flowOrder = "TACG"; }
319 else if(flowOrder.length() != 4){
320 m->mothurOut("The value of the order option must be four bases long\n");
328 catch(exception& e) {
329 m->errorOut(e, "ShhherCommand", "ShhherCommand");
333 //**********************************************************************************************************************
335 int ShhherCommand::execute(){
337 if (abort == true) { if (calledHelp) { return 0; } return 2; }
344 for(int i=1;i<ncpus;i++){
345 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
347 if(abort == 1){ return 0; }
351 m->mothurOut("\nGetting preliminary data...\n");
352 getSingleLookUp(); if (m->control_pressed) { return 0; }
353 getJointLookUp(); if (m->control_pressed) { return 0; }
355 vector<string> flowFileVector;
356 if(flowFilesFileName != "not found"){
359 ifstream flowFilesFile;
360 m->openInputFile(flowFilesFileName, flowFilesFile);
361 while(flowFilesFile){
362 fName = m->getline(flowFilesFile);
363 flowFileVector.push_back(fName);
364 m->gobble(flowFilesFile);
368 flowFileVector.push_back(flowFileName);
371 int numFiles = flowFileVector.size();
373 for(int i=1;i<ncpus;i++){
374 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
377 for(int i=0;i<numFiles;i++){
379 if (m->control_pressed) { break; }
381 double begClock = clock();
382 unsigned long long begTime = time(NULL);
384 flowFileName = flowFileVector[i];
386 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
387 m->mothurOut("Reading flowgrams...\n");
390 if (m->control_pressed) { break; }
392 m->mothurOut("Identifying unique flowgrams...\n");
395 if (m->control_pressed) { break; }
397 m->mothurOut("Calculating distances between flowgrams...\n");
399 strcpy(fileName, flowFileName.c_str());
401 for(int i=1;i<ncpus;i++){
402 MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
404 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
405 MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
406 MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
407 MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
408 MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
409 MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
410 MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
411 MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
412 MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
413 MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
416 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
418 if (m->control_pressed) { break; }
421 for(int i=1;i<ncpus;i++){
422 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
424 m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
425 m->mothurRemove((distFileName + ".temp." + toString(i)));
428 string namesFileName = createNamesFile();
430 if (m->control_pressed) { break; }
432 m->mothurOut("\nClustering flowgrams...\n");
433 string listFileName = cluster(distFileName, namesFileName);
435 if (m->control_pressed) { break; }
437 getOTUData(listFileName);
439 m->mothurRemove(distFileName);
440 m->mothurRemove(namesFileName);
441 m->mothurRemove(listFileName);
443 if (m->control_pressed) { break; }
447 if (m->control_pressed) { break; }
449 for(int i=1;i<ncpus;i++){
450 MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
451 MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
452 MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
453 MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
456 if (m->control_pressed) { break; }
461 int numOTUsOnCPU = numOTUs / ncpus;
462 int numSeqsOnCPU = numSeqs / ncpus;
463 m->mothurOut("\nDenoising flowgrams...\n");
464 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
466 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
468 double cycClock = clock();
469 unsigned long long cycTime = time(NULL);
472 if (m->control_pressed) { break; }
474 int total = singleTau.size();
475 for(int i=1;i<ncpus;i++){
476 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
477 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
478 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
480 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
481 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
482 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
483 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
484 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
487 calcCentroidsDriver(0, numOTUsOnCPU);
489 for(int i=1;i<ncpus;i++){
490 int otuStart = i * numOTUs / ncpus;
491 int otuStop = (i + 1) * numOTUs / ncpus;
493 vector<int> tempCentroids(numOTUs, 0);
494 vector<short> tempChange(numOTUs, 0);
496 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
497 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
499 for(int j=otuStart;j<otuStop;j++){
500 centroids[j] = tempCentroids[j];
501 change[j] = tempChange[j];
505 maxDelta = getNewWeights(); if (m->control_pressed) { break; }
506 double nLL = getLikelihood(); if (m->control_pressed) { break; }
507 checkCentroids(); if (m->control_pressed) { break; }
509 for(int i=1;i<ncpus;i++){
510 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
511 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
512 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
515 calcNewDistancesParent(0, numSeqsOnCPU);
517 total = singleTau.size();
519 for(int i=1;i<ncpus;i++){
521 int seqStart = i * numSeqs / ncpus;
522 int seqStop = (i + 1) * numSeqs / ncpus;
524 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
526 vector<int> childSeqIndex(childTotal, 0);
527 vector<double> childSingleTau(childTotal, 0);
528 vector<double> childDist(numSeqs * numOTUs, 0);
529 vector<int> otuIndex(childTotal, 0);
531 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
532 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
533 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
534 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
536 int oldTotal = total;
538 singleTau.resize(total, 0);
539 seqIndex.resize(total, 0);
540 seqNumber.resize(total, 0);
544 for(int j=oldTotal;j<total;j++){
545 int otuI = otuIndex[childIndex];
546 int seqI = childSeqIndex[childIndex];
548 singleTau[j] = childSingleTau[childIndex];
550 aaP[otuI][nSeqsPerOTU[otuI]] = j;
551 aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
556 int index = seqStart * numOTUs;
557 for(int j=seqStart;j<seqStop;j++){
558 for(int k=0;k<numOTUs;k++){
559 dist[index] = childDist[index];
567 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
569 if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
571 for(int i=1;i<ncpus;i++){
572 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
577 for(int i=1;i<ncpus;i++){
578 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
584 if (m->control_pressed) { break; }
586 m->mothurOut("\nFinalizing...\n");
589 if (m->control_pressed) { break; }
593 vector<int> otuCounts(numOTUs, 0);
594 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
595 calcCentroidsDriver(0, numOTUs);
597 if (m->control_pressed) { break; }
599 writeQualities(otuCounts); if (m->control_pressed) { break; }
600 writeSequences(otuCounts); if (m->control_pressed) { break; }
601 writeNames(otuCounts); if (m->control_pressed) { break; }
602 writeClusters(otuCounts); if (m->control_pressed) { break; }
603 writeGroups(); if (m->control_pressed) { break; }
606 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
612 MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
613 if(abort){ return 0; }
616 MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
618 for(int i=0;i<numFiles;i++){
620 if (m->control_pressed) { break; }
622 //Now into the pyrodist part
626 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
627 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
628 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
631 flowDataIntI.resize(numSeqs * numFlowCells);
632 flowDataPrI.resize(numSeqs * numFlowCells);
633 mapUniqueToSeq.resize(numSeqs);
634 mapSeqToUnique.resize(numSeqs);
635 lengths.resize(numSeqs);
636 jointLookUp.resize(NUMBINS * NUMBINS);
638 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
639 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
640 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
642 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
643 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
644 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
646 flowFileName = string(fileName);
647 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
648 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
650 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
652 if (m->control_pressed) { break; }
655 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
657 //Now into the pyronoise part
658 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
660 singleLookUp.resize(HOMOPS * NUMBINS);
661 uniqueFlowgrams.resize(numUniques * numFlowCells);
662 weight.resize(numOTUs);
663 centroids.resize(numOTUs);
664 change.resize(numOTUs);
665 dist.assign(numOTUs * numSeqs, 0);
666 nSeqsPerOTU.resize(numOTUs);
667 cumNumSeqs.resize(numOTUs);
669 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
670 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
671 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
673 int startOTU = pid * numOTUs / ncpus;
674 int endOTU = (pid + 1) * numOTUs / ncpus;
676 int startSeq = pid * numSeqs / ncpus;
677 int endSeq = (pid + 1) * numSeqs /ncpus;
683 if (m->control_pressed) { break; }
685 MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686 singleTau.assign(total, 0.0000);
687 seqNumber.assign(total, 0);
688 seqIndex.assign(total, 0);
690 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
691 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692 MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
693 MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
694 MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
695 MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
696 MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698 calcCentroidsDriver(startOTU, endOTU);
700 MPI_Send(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
701 MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
703 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704 MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
707 vector<int> otuIndex(total, 0);
708 calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
709 total = otuIndex.size();
711 MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
712 MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
713 MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
714 MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
715 MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
717 MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
722 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
724 MPI_Barrier(MPI_COMM_WORLD);
727 if(compositeFASTAFileName != ""){
728 outputNames.push_back(compositeFASTAFileName);
729 outputNames.push_back(compositeNamesFileName);
732 m->mothurOutEndLine();
733 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
734 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
735 m->mothurOutEndLine();
740 catch(exception& e) {
741 m->errorOut(e, "ShhherCommand", "execute");
745 /**************************************************************************************************/
746 string ShhherCommand::createNamesFile(){
749 vector<string> duplicateNames(numUniques, "");
750 for(int i=0;i<numSeqs;i++){
751 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
754 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
757 m->openOutputFile(nameFileName, nameFile);
759 for(int i=0;i<numUniques;i++){
761 if (m->control_pressed) { break; }
763 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
764 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
770 catch(exception& e) {
771 m->errorOut(e, "ShhherCommand", "createNamesFile");
775 /**************************************************************************************************/
777 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
779 ostringstream outStream;
780 outStream.setf(ios::fixed, ios::floatfield);
781 outStream.setf(ios::dec, ios::basefield);
782 outStream.setf(ios::showpoint);
783 outStream.precision(6);
785 int begTime = time(NULL);
786 double begClock = clock();
788 for(int i=startSeq;i<stopSeq;i++){
790 if (m->control_pressed) { break; }
792 for(int j=0;j<i;j++){
793 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
795 if(flowDistance < 1e-6){
796 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
798 else if(flowDistance <= cutoff){
799 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
803 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
807 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
808 if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
810 if (m->control_pressed) { return fDistFileName; }
812 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
814 ofstream distFile(fDistFileName.c_str());
815 distFile << outStream.str();
818 return fDistFileName;
820 catch(exception& e) {
821 m->errorOut(e, "ShhherCommand", "flowDistMPI");
825 /**************************************************************************************************/
827 void ShhherCommand::getOTUData(string listFileName){
831 m->openInputFile(listFileName, listFile);
834 listFile >> label >> numOTUs;
836 otuData.assign(numSeqs, 0);
837 cumNumSeqs.assign(numOTUs, 0);
838 nSeqsPerOTU.assign(numOTUs, 0);
839 aaP.clear();aaP.resize(numOTUs);
845 string singleOTU = "";
847 for(int i=0;i<numOTUs;i++){
849 if (m->control_pressed) { break; }
851 listFile >> singleOTU;
853 istringstream otuString(singleOTU);
859 for(int j=0;j<singleOTU.length();j++){
860 char letter = otuString.get();
866 map<string,int>::iterator nmIt = nameMap.find(seqName);
867 int index = nmIt->second;
873 aaP[i].push_back(index);
878 map<string,int>::iterator nmIt = nameMap.find(seqName);
880 int index = nmIt->second;
885 aaP[i].push_back(index);
890 sort(aaP[i].begin(), aaP[i].end());
891 for(int j=0;j<nSeqsPerOTU[i];j++){
892 seqNumber.push_back(aaP[i][j]);
894 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
901 for(int i=1;i<numOTUs;i++){
902 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
905 seqIndex = seqNumber;
910 catch(exception& e) {
911 m->errorOut(e, "ShhherCommand", "getOTUData");
916 /**************************************************************************************************/
918 void ShhherCommand::initPyroCluster(){
920 if (numOTUs < processors) { processors = 1; }
922 dist.assign(numSeqs * numOTUs, 0);
923 change.assign(numOTUs, 1);
924 centroids.assign(numOTUs, -1);
925 weight.assign(numOTUs, 0);
926 singleTau.assign(numSeqs, 1.0);
928 nSeqsBreaks.assign(processors+1, 0);
929 nOTUsBreaks.assign(processors+1, 0);
932 for(int i=0;i<processors;i++){
933 nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
934 nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
936 nSeqsBreaks[processors] = numSeqs;
937 nOTUsBreaks[processors] = numOTUs;
939 catch(exception& e) {
940 m->errorOut(e, "ShhherCommand", "initPyroCluster");
945 /**************************************************************************************************/
947 void ShhherCommand::fill(){
950 for(int i=0;i<numOTUs;i++){
952 if (m->control_pressed) { break; }
954 cumNumSeqs[i] = index;
955 for(int j=0;j<nSeqsPerOTU[i];j++){
956 seqNumber[index] = aaP[i][j];
957 seqIndex[index] = aaI[i][j];
963 catch(exception& e) {
964 m->errorOut(e, "ShhherCommand", "fill");
969 /**************************************************************************************************/
971 void ShhherCommand::getFlowData(){
974 m->openInputFile(flowFileName, flowFile);
977 seqNameVector.clear();
979 flowDataIntI.clear();
983 int currentNumFlowCells;
987 flowFile >> numFlowCells;
988 int index = 0;//pcluster
989 while(!flowFile.eof()){
991 if (m->control_pressed) { break; }
993 flowFile >> seqName >> currentNumFlowCells;
994 lengths.push_back(currentNumFlowCells);
996 seqNameVector.push_back(seqName);
997 nameMap[seqName] = index++;//pcluster
999 for(int i=0;i<numFlowCells;i++){
1000 flowFile >> intensity;
1001 if(intensity > 9.99) { intensity = 9.99; }
1002 int intI = int(100 * intensity + 0.0001);
1003 flowDataIntI.push_back(intI);
1005 m->gobble(flowFile);
1009 numSeqs = seqNameVector.size();
1011 for(int i=0;i<numSeqs;i++){
1013 if (m->control_pressed) { break; }
1015 int iNumFlowCells = i * numFlowCells;
1016 for(int j=lengths[i];j<numFlowCells;j++){
1017 flowDataIntI[iNumFlowCells + j] = 0;
1022 catch(exception& e) {
1023 m->errorOut(e, "ShhherCommand", "getFlowData");
1027 /**************************************************************************************************/
1028 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1031 vector<double> newTau(numOTUs,0);
1032 vector<double> norms(numSeqs, 0);
1037 for(int i=startSeq;i<stopSeq;i++){
1039 if (m->control_pressed) { break; }
1041 double offset = 1e8;
1042 int indexOffset = i * numOTUs;
1044 for(int j=0;j<numOTUs;j++){
1046 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1047 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1049 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1050 offset = dist[indexOffset + j];
1054 for(int j=0;j<numOTUs;j++){
1055 if(weight[j] > MIN_WEIGHT){
1056 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1057 norms[i] += newTau[j];
1064 for(int j=0;j<numOTUs;j++){
1066 newTau[j] /= norms[i];
1068 if(newTau[j] > MIN_TAU){
1069 otuIndex.push_back(j);
1070 seqIndex.push_back(i);
1071 singleTau.push_back(newTau[j]);
1077 catch(exception& e) {
1078 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1083 /**************************************************************************************************/
1085 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1090 vector<double> newTau(numOTUs,0);
1091 vector<double> norms(numSeqs, 0);
1092 nSeqsPerOTU.assign(numOTUs, 0);
1094 for(int i=startSeq;i<stopSeq;i++){
1096 if (m->control_pressed) { break; }
1098 int indexOffset = i * numOTUs;
1100 double offset = 1e8;
1102 for(int j=0;j<numOTUs;j++){
1104 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1105 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1108 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1109 offset = dist[indexOffset + j];
1113 for(int j=0;j<numOTUs;j++){
1114 if(weight[j] > MIN_WEIGHT){
1115 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1116 norms[i] += newTau[j];
1123 for(int j=0;j<numOTUs;j++){
1124 newTau[j] /= norms[i];
1127 for(int j=0;j<numOTUs;j++){
1128 if(newTau[j] > MIN_TAU){
1130 int oldTotal = total;
1134 singleTau.resize(total, 0);
1135 seqNumber.resize(total, 0);
1136 seqIndex.resize(total, 0);
1138 singleTau[oldTotal] = newTau[j];
1140 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1141 aaI[j][nSeqsPerOTU[j]] = i;
1149 catch(exception& e) {
1150 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1155 /**************************************************************************************************/
1157 void ShhherCommand::setOTUs(){
1160 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1162 for(int i=0;i<numOTUs;i++){
1164 if (m->control_pressed) { break; }
1166 for(int j=0;j<nSeqsPerOTU[i];j++){
1167 int index = cumNumSeqs[i] + j;
1168 double tauValue = singleTau[seqNumber[index]];
1169 int sIndex = seqIndex[index];
1170 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1174 for(int i=0;i<numSeqs;i++){
1175 double maxTau = -1.0000;
1177 for(int j=0;j<numOTUs;j++){
1178 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1179 maxTau = bigTauMatrix[i * numOTUs + j];
1184 otuData[i] = maxOTU;
1187 nSeqsPerOTU.assign(numOTUs, 0);
1189 for(int i=0;i<numSeqs;i++){
1190 int index = otuData[i];
1192 singleTau[i] = 1.0000;
1195 aaP[index][nSeqsPerOTU[index]] = i;
1196 aaI[index][nSeqsPerOTU[index]] = i;
1198 nSeqsPerOTU[index]++;
1202 catch(exception& e) {
1203 m->errorOut(e, "ShhherCommand", "setOTUs");
1208 /**************************************************************************************************/
1210 void ShhherCommand::getUniques(){
1215 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1216 uniqueCount.assign(numSeqs, 0); // anWeights
1217 uniqueLengths.assign(numSeqs, 0);
1218 mapSeqToUnique.assign(numSeqs, -1);
1219 mapUniqueToSeq.assign(numSeqs, -1);
1221 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1223 for(int i=0;i<numSeqs;i++){
1225 if (m->control_pressed) { break; }
1229 vector<short> current(numFlowCells);
1230 for(int j=0;j<numFlowCells;j++){
1231 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1234 for(int j=0;j<numUniques;j++){
1235 int offset = j * numFlowCells;
1239 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
1240 else { shorterLength = uniqueLengths[j]; }
1242 for(int k=0;k<shorterLength;k++){
1243 if(current[k] != uniqueFlowgrams[offset + k]){
1250 mapSeqToUnique[i] = j;
1253 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
1259 if(index == numUniques){
1260 uniqueLengths[numUniques] = lengths[i];
1261 uniqueCount[numUniques] = 1;
1262 mapSeqToUnique[i] = numUniques;//anMap
1263 mapUniqueToSeq[numUniques] = i;//anF
1265 for(int k=0;k<numFlowCells;k++){
1266 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1267 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1273 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1274 uniqueLengths.resize(numUniques);
1276 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1277 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
1279 catch(exception& e) {
1280 m->errorOut(e, "ShhherCommand", "getUniques");
1285 /**************************************************************************************************/
1287 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1289 int minLength = lengths[mapSeqToUnique[seqA]];
1290 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
1292 int ANumFlowCells = seqA * numFlowCells;
1293 int BNumFlowCells = seqB * numFlowCells;
1297 for(int i=0;i<minLength;i++){
1299 if (m->control_pressed) { break; }
1301 int flowAIntI = flowDataIntI[ANumFlowCells + i];
1302 float flowAPrI = flowDataPrI[ANumFlowCells + i];
1304 int flowBIntI = flowDataIntI[BNumFlowCells + i];
1305 float flowBPrI = flowDataPrI[BNumFlowCells + i];
1306 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1309 dist /= (float) minLength;
1312 catch(exception& e) {
1313 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1318 //**********************************************************************************************************************/
1320 string ShhherCommand::cluster(string distFileName, string namesFileName){
1323 ReadMatrix* read = new ReadColumnMatrix(distFileName);
1324 read->setCutoff(cutoff);
1326 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1327 clusterNameMap->readMap();
1328 read->read(clusterNameMap);
1330 ListVector* list = read->getListVector();
1331 SparseMatrix* matrix = read->getMatrix();
1334 delete clusterNameMap;
1336 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1338 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
1339 string tag = cluster->getTag();
1341 double clusterCutoff = cutoff;
1342 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1344 if (m->control_pressed) { break; }
1346 cluster->update(clusterCutoff);
1349 list->setLabel(toString(cutoff));
1351 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1353 m->openOutputFile(listFileName, listFile);
1354 list->print(listFile);
1357 delete matrix; delete cluster; delete rabund; delete list;
1359 return listFileName;
1361 catch(exception& e) {
1362 m->errorOut(e, "ShhherCommand", "cluster");
1367 /**************************************************************************************************/
1369 void ShhherCommand::calcCentroidsDriver(int start, int finish){
1371 //this function gets the most likely homopolymer length at a flow position for a group of sequences
1376 for(int i=start;i<finish;i++){
1378 if (m->control_pressed) { break; }
1382 int minFlowGram = 100000000;
1383 double minFlowValue = 1e8;
1384 change[i] = 0; //FALSE
1386 for(int j=0;j<nSeqsPerOTU[i];j++){
1387 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1390 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1391 vector<double> adF(nSeqsPerOTU[i]);
1392 vector<int> anL(nSeqsPerOTU[i]);
1394 for(int j=0;j<nSeqsPerOTU[i];j++){
1395 int index = cumNumSeqs[i] + j;
1396 int nI = seqIndex[index];
1397 int nIU = mapSeqToUnique[nI];
1400 for(k=0;k<position;k++){
1406 anL[position] = nIU;
1407 adF[position] = 0.0000;
1412 for(int j=0;j<nSeqsPerOTU[i];j++){
1413 int index = cumNumSeqs[i] + j;
1414 int nI = seqIndex[index];
1416 double tauValue = singleTau[seqNumber[index]];
1418 for(int k=0;k<position;k++){
1419 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420 adF[k] += dist * tauValue;
1424 for(int j=0;j<position;j++){
1425 if(adF[j] < minFlowValue){
1427 minFlowValue = adF[j];
1431 if(centroids[i] != anL[minFlowGram]){
1433 centroids[i] = anL[minFlowGram];
1436 else if(centroids[i] != -1){
1442 catch(exception& e) {
1443 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1448 /**************************************************************************************************/
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1453 int flowAValue = cent * numFlowCells;
1454 int flowBValue = flow * numFlowCells;
1458 for(int i=0;i<length;i++){
1459 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1464 return dist / (double)length;
1466 catch(exception& e) {
1467 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1472 /**************************************************************************************************/
1474 double ShhherCommand::getNewWeights(){
1477 double maxChange = 0;
1479 for(int i=0;i<numOTUs;i++){
1481 if (m->control_pressed) { break; }
1483 double difference = weight[i];
1486 for(int j=0;j<nSeqsPerOTU[i];j++){
1487 int index = cumNumSeqs[i] + j;
1488 double tauValue = singleTau[seqNumber[index]];
1489 weight[i] += tauValue;
1492 difference = fabs(weight[i] - difference);
1493 if(difference > maxChange){ maxChange = difference; }
1497 catch(exception& e) {
1498 m->errorOut(e, "ShhherCommand", "getNewWeights");
1503 /**************************************************************************************************/
1505 double ShhherCommand::getLikelihood(){
1509 vector<long double> P(numSeqs, 0);
1512 for(int i=0;i<numOTUs;i++){
1513 if(weight[i] > MIN_WEIGHT){
1519 for(int i=0;i<numOTUs;i++){
1521 if (m->control_pressed) { break; }
1523 for(int j=0;j<nSeqsPerOTU[i];j++){
1524 int index = cumNumSeqs[i] + j;
1525 int nI = seqIndex[index];
1526 double singleDist = dist[seqNumber[index]];
1528 P[nI] += weight[i] * exp(-singleDist * sigma);
1532 for(int i=0;i<numSeqs;i++){
1533 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1538 nLL = nLL -(double)numSeqs * log(sigma);
1542 catch(exception& e) {
1543 m->errorOut(e, "ShhherCommand", "getNewWeights");
1548 /**************************************************************************************************/
1550 void ShhherCommand::checkCentroids(){
1552 vector<int> unique(numOTUs, 1);
1554 for(int i=0;i<numOTUs;i++){
1555 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1560 for(int i=0;i<numOTUs;i++){
1562 if (m->control_pressed) { break; }
1565 for(int j=i+1;j<numOTUs;j++){
1568 if(centroids[j] == centroids[i]){
1572 weight[i] += weight[j];
1580 catch(exception& e) {
1581 m->errorOut(e, "ShhherCommand", "checkCentroids");
1585 /**************************************************************************************************/
1589 void ShhherCommand::writeQualities(vector<int> otuCounts){
1592 string thisOutputDir = outputDir;
1593 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1594 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1596 ofstream qualityFile;
1597 m->openOutputFile(qualityFileName, qualityFile);
1599 qualityFile.setf(ios::fixed, ios::floatfield);
1600 qualityFile.setf(ios::showpoint);
1601 qualityFile << setprecision(6);
1603 vector<vector<int> > qualities(numOTUs);
1604 vector<double> pr(HOMOPS, 0);
1607 for(int i=0;i<numOTUs;i++){
1609 if (m->control_pressed) { break; }
1614 if(nSeqsPerOTU[i] > 0){
1615 qualities[i].assign(1024, -1);
1617 while(index < numFlowCells){
1618 double maxPrValue = 1e8;
1619 short maxPrIndex = -1;
1620 double count = 0.0000;
1622 pr.assign(HOMOPS, 0);
1624 for(int j=0;j<nSeqsPerOTU[i];j++){
1625 int lIndex = cumNumSeqs[i] + j;
1626 double tauValue = singleTau[seqNumber[lIndex]];
1627 int sequenceIndex = aaI[i][j];
1628 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1632 for(int s=0;s<HOMOPS;s++){
1633 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1637 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1638 maxPrValue = pr[maxPrIndex];
1640 if(count > MIN_COUNT){
1642 double norm = 0.0000;
1644 for(int s=0;s<HOMOPS;s++){
1645 norm += exp(-(pr[s] - maxPrValue));
1648 for(int s=1;s<=maxPrIndex;s++){
1650 double temp = 0.0000;
1652 U += exp(-(pr[s-1]-maxPrValue))/norm;
1660 temp = floor(-10 * temp);
1661 value = (int)floor(temp);
1662 if(value > 100){ value = 100; }
1664 qualities[i][base] = (int)value;
1674 if(otuCounts[i] > 0){
1675 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1677 int j=4; //need to get past the first four bases
1678 while(qualities[i][j] != -1){
1679 qualityFile << qualities[i][j] << ' ';
1682 qualityFile << endl;
1685 qualityFile.close();
1686 outputNames.push_back(qualityFileName);
1689 catch(exception& e) {
1690 m->errorOut(e, "ShhherCommand", "writeQualities");
1695 /**************************************************************************************************/
1697 void ShhherCommand::writeSequences(vector<int> otuCounts){
1699 string thisOutputDir = outputDir;
1700 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1701 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1703 m->openOutputFile(fastaFileName, fastaFile);
1705 vector<string> names(numOTUs, "");
1707 for(int i=0;i<numOTUs;i++){
1709 if (m->control_pressed) { break; }
1711 int index = centroids[i];
1713 if(otuCounts[i] > 0){
1714 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1718 for(int j=0;j<numFlowCells;j++){
1720 char base = flowOrder[j % 4];
1721 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1726 fastaFile << newSeq.substr(4) << endl;
1731 outputNames.push_back(fastaFileName);
1733 if(compositeFASTAFileName != ""){
1734 m->appendFiles(fastaFileName, compositeFASTAFileName);
1737 catch(exception& e) {
1738 m->errorOut(e, "ShhherCommand", "writeSequences");
1743 /**************************************************************************************************/
1745 void ShhherCommand::writeNames(vector<int> otuCounts){
1747 string thisOutputDir = outputDir;
1748 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1749 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1751 m->openOutputFile(nameFileName, nameFile);
1753 for(int i=0;i<numOTUs;i++){
1755 if (m->control_pressed) { break; }
1757 if(otuCounts[i] > 0){
1758 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1760 for(int j=1;j<nSeqsPerOTU[i];j++){
1761 nameFile << ',' << seqNameVector[aaI[i][j]];
1768 outputNames.push_back(nameFileName);
1771 if(compositeNamesFileName != ""){
1772 m->appendFiles(nameFileName, compositeNamesFileName);
1775 catch(exception& e) {
1776 m->errorOut(e, "ShhherCommand", "writeNames");
1781 /**************************************************************************************************/
1783 void ShhherCommand::writeGroups(){
1785 string thisOutputDir = outputDir;
1786 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1787 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1788 string groupFileName = fileRoot + "shhh.groups";
1790 m->openOutputFile(groupFileName, groupFile);
1792 for(int i=0;i<numSeqs;i++){
1793 if (m->control_pressed) { break; }
1794 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1797 outputNames.push_back(groupFileName);
1800 catch(exception& e) {
1801 m->errorOut(e, "ShhherCommand", "writeGroups");
1806 /**************************************************************************************************/
1808 void ShhherCommand::writeClusters(vector<int> otuCounts){
1810 string thisOutputDir = outputDir;
1811 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1812 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1813 ofstream otuCountsFile;
1814 m->openOutputFile(otuCountsFileName, otuCountsFile);
1816 string bases = flowOrder;
1818 for(int i=0;i<numOTUs;i++){
1820 if (m->control_pressed) {
1823 //output the translated version of the centroid sequence for the otu
1824 if(otuCounts[i] > 0){
1825 int index = centroids[i];
1827 otuCountsFile << "ideal\t";
1828 for(int j=8;j<numFlowCells;j++){
1829 char base = bases[j % 4];
1830 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1831 otuCountsFile << base;
1834 otuCountsFile << endl;
1836 for(int j=0;j<nSeqsPerOTU[i];j++){
1837 int sequence = aaI[i][j];
1838 otuCountsFile << seqNameVector[sequence] << '\t';
1842 for(int k=0;k<lengths[sequence];k++){
1843 char base = bases[k % 4];
1844 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1846 for(int s=0;s<freq;s++){
1848 //otuCountsFile << base;
1851 otuCountsFile << newSeq.substr(4) << endl;
1853 otuCountsFile << endl;
1856 otuCountsFile.close();
1857 outputNames.push_back(otuCountsFileName);
1860 catch(exception& e) {
1861 m->errorOut(e, "ShhherCommand", "writeClusters");
1867 //**********************************************************************************************************************
1869 int ShhherCommand::execute(){
1871 if (abort == true) { return 0; }
1873 getSingleLookUp(); if (m->control_pressed) { return 0; }
1874 getJointLookUp(); if (m->control_pressed) { return 0; }
1876 int numFiles = flowFileVector.size();
1878 if (numFiles < processors) { processors = numFiles; }
1880 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1881 if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1882 else { createProcesses(flowFileVector); } //each processor processes one file
1884 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1887 if(compositeFASTAFileName != ""){
1888 outputNames.push_back(compositeFASTAFileName);
1889 outputNames.push_back(compositeNamesFileName);
1892 m->mothurOutEndLine();
1893 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1894 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
1895 m->mothurOutEndLine();
1899 catch(exception& e) {
1900 m->errorOut(e, "ShhherCommand", "execute");
1905 /**************************************************************************************************/
1907 int ShhherCommand::createProcesses(vector<string> filenames){
1909 vector<int> processIDS;
1914 if (filenames.size() < processors) { processors = filenames.size(); }
1916 //divide the groups between the processors
1917 vector<linePair> lines;
1918 int numFilesPerProcessor = filenames.size() / processors;
1919 for (int i = 0; i < processors; i++) {
1920 int startIndex = i * numFilesPerProcessor;
1921 int endIndex = (i+1) * numFilesPerProcessor;
1922 if(i == (processors - 1)){ endIndex = filenames.size(); }
1923 lines.push_back(linePair(startIndex, endIndex));
1926 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1928 //loop through and create all the processes you want
1929 while (process != processors) {
1933 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1935 }else if (pid == 0){
1936 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1939 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1940 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1946 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1948 //force parent to wait until all the processes are done
1949 for (int i=0;i<processIDS.size();i++) {
1950 int temp = processIDS[i];
1956 //////////////////////////////////////////////////////////////////////////////////////////////////////
1958 /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1960 //////////////////////////////////////////////////////////////////////////////////////////////////////
1961 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
1962 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1963 //////////////////////////////////////////////////////////////////////////////////////////////////////
1965 vector<shhhFlowsData*> pDataArray;
1966 DWORD dwThreadIdArray[processors-1];
1967 HANDLE hThreadArray[processors-1];
1969 //Create processor worker threads.
1970 for( int i=0; i<processors-1; i++ ){
1971 // Allocate memory for thread data.
1972 string extension = "";
1973 if (i != 0) { extension = toString(i) + ".temp"; }
1975 shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1976 pDataArray.push_back(tempFlow);
1977 processIDS.push_back(i);
1979 hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1982 //using the main process as a worker saves time and memory
1984 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1986 //Wait until all threads have terminated.
1987 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1989 //Close all thread handles and free memory allocations.
1990 for(int i=0; i < pDataArray.size(); i++){
1991 for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1992 CloseHandle(hThreadArray[i]);
1993 delete pDataArray[i];
1998 for (int i=0;i<processIDS.size();i++) {
1999 if (compositeFASTAFileName != "") {
2000 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2001 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2002 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2003 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2010 catch(exception& e) {
2011 m->errorOut(e, "ShhherCommand", "createProcesses");
2015 /**************************************************************************************************/
2017 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2020 for(int i=start;i<end;i++){
2022 if (m->control_pressed) { break; }
2024 string flowFileName = filenames[i];
2026 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2027 m->mothurOut("Reading flowgrams...\n");
2029 vector<string> seqNameVector;
2030 vector<int> lengths;
2031 vector<short> flowDataIntI;
2032 vector<double> flowDataPrI;
2033 map<string, int> nameMap;
2034 vector<short> uniqueFlowgrams;
2035 vector<int> uniqueCount;
2036 vector<int> mapSeqToUnique;
2037 vector<int> mapUniqueToSeq;
2038 vector<int> uniqueLengths;
2041 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2043 if (m->control_pressed) { break; }
2045 m->mothurOut("Identifying unique flowgrams...\n");
2046 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2048 if (m->control_pressed) { break; }
2050 m->mothurOut("Calculating distances between flowgrams...\n");
2051 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2052 unsigned long long begTime = time(NULL);
2053 double begClock = clock();
2055 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2057 m->mothurOutEndLine();
2058 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2061 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2062 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2064 if (m->control_pressed) { break; }
2066 m->mothurOut("\nClustering flowgrams...\n");
2067 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2068 cluster(listFileName, distFileName, namesFileName);
2070 if (m->control_pressed) { break; }
2072 vector<int> otuData;
2073 vector<int> cumNumSeqs;
2074 vector<int> nSeqsPerOTU;
2075 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2076 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2077 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
2078 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
2081 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2083 if (m->control_pressed) { break; }
2085 m->mothurRemove(distFileName);
2086 m->mothurRemove(namesFileName);
2087 m->mothurRemove(listFileName);
2089 vector<double> dist; //adDist - distance of sequences to centroids
2090 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
2091 vector<int> centroids; //the representative flowgram for each cluster m
2092 vector<double> weight;
2093 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2094 vector<int> nSeqsBreaks;
2095 vector<int> nOTUsBreaks;
2097 dist.assign(numSeqs * numOTUs, 0);
2098 change.assign(numOTUs, 1);
2099 centroids.assign(numOTUs, -1);
2100 weight.assign(numOTUs, 0);
2101 singleTau.assign(numSeqs, 1.0);
2103 nSeqsBreaks.assign(2, 0);
2104 nOTUsBreaks.assign(2, 0);
2107 nSeqsBreaks[1] = numSeqs;
2108 nOTUsBreaks[1] = numOTUs;
2110 if (m->control_pressed) { break; }
2112 double maxDelta = 0;
2116 begTime = time(NULL);
2118 m->mothurOut("\nDenoising flowgrams...\n");
2119 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2121 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2123 if (m->control_pressed) { break; }
2125 double cycClock = clock();
2126 unsigned long long cycTime = time(NULL);
2127 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2129 if (m->control_pressed) { break; }
2131 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2133 if (m->control_pressed) { break; }
2135 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
2137 if (m->control_pressed) { break; }
2139 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
2141 if (m->control_pressed) { break; }
2143 checkCentroids(numOTUs, centroids, weight);
2145 if (m->control_pressed) { break; }
2147 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2149 if (m->control_pressed) { break; }
2153 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2157 if (m->control_pressed) { break; }
2159 m->mothurOut("\nFinalizing...\n");
2160 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2162 if (m->control_pressed) { break; }
2164 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2166 if (m->control_pressed) { break; }
2168 vector<int> otuCounts(numOTUs, 0);
2169 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
2171 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2173 if (m->control_pressed) { break; }
2175 writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2176 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2177 writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
2178 writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
2179 writeGroups(flowFileName, numSeqs, seqNameVector); if (m->control_pressed) { break; }
2181 m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2184 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2188 }catch(exception& e) {
2189 m->errorOut(e, "ShhherCommand", "driver");
2194 /**************************************************************************************************/
2195 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2200 m->openInputFile(filename, flowFile);
2203 int currentNumFlowCells;
2205 thisSeqNameVector.clear();
2206 thisLengths.clear();
2207 thisFlowDataIntI.clear();
2208 thisNameMap.clear();
2210 flowFile >> numFlowCells;
2211 int index = 0;//pcluster
2212 while(!flowFile.eof()){
2214 if (m->control_pressed) { break; }
2216 flowFile >> seqName >> currentNumFlowCells;
2217 thisLengths.push_back(currentNumFlowCells);
2219 thisSeqNameVector.push_back(seqName);
2220 thisNameMap[seqName] = index++;//pcluster
2222 for(int i=0;i<numFlowCells;i++){
2223 flowFile >> intensity;
2224 if(intensity > 9.99) { intensity = 9.99; }
2225 int intI = int(100 * intensity + 0.0001);
2226 thisFlowDataIntI.push_back(intI);
2228 m->gobble(flowFile);
2232 int numSeqs = thisSeqNameVector.size();
2234 for(int i=0;i<numSeqs;i++){
2236 if (m->control_pressed) { break; }
2238 int iNumFlowCells = i * numFlowCells;
2239 for(int j=thisLengths[i];j<numFlowCells;j++){
2240 thisFlowDataIntI[iNumFlowCells + j] = 0;
2247 catch(exception& e) {
2248 m->errorOut(e, "ShhherCommand", "getFlowData");
2252 /**************************************************************************************************/
2254 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2257 ostringstream outStream;
2258 outStream.setf(ios::fixed, ios::floatfield);
2259 outStream.setf(ios::dec, ios::basefield);
2260 outStream.setf(ios::showpoint);
2261 outStream.precision(6);
2263 int begTime = time(NULL);
2264 double begClock = clock();
2266 for(int i=0;i<stopSeq;i++){
2268 if (m->control_pressed) { break; }
2270 for(int j=0;j<i;j++){
2271 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2273 if(flowDistance < 1e-6){
2274 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2276 else if(flowDistance <= cutoff){
2277 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2281 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2282 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2283 m->mothurOutEndLine();
2287 ofstream distFile(distFileName.c_str());
2288 distFile << outStream.str();
2291 if (m->control_pressed) {}
2293 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2294 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2295 m->mothurOutEndLine();
2300 catch(exception& e) {
2301 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2305 /**************************************************************************************************/
2307 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2309 int minLength = lengths[mapSeqToUnique[seqA]];
2310 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
2312 int ANumFlowCells = seqA * numFlowCells;
2313 int BNumFlowCells = seqB * numFlowCells;
2317 for(int i=0;i<minLength;i++){
2319 if (m->control_pressed) { break; }
2321 int flowAIntI = flowDataIntI[ANumFlowCells + i];
2322 float flowAPrI = flowDataPrI[ANumFlowCells + i];
2324 int flowBIntI = flowDataIntI[BNumFlowCells + i];
2325 float flowBPrI = flowDataPrI[BNumFlowCells + i];
2326 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2329 dist /= (float) minLength;
2332 catch(exception& e) {
2333 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2338 /**************************************************************************************************/
2340 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2343 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2344 uniqueCount.assign(numSeqs, 0); // anWeights
2345 uniqueLengths.assign(numSeqs, 0);
2346 mapSeqToUnique.assign(numSeqs, -1);
2347 mapUniqueToSeq.assign(numSeqs, -1);
2349 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2351 for(int i=0;i<numSeqs;i++){
2353 if (m->control_pressed) { break; }
2357 vector<short> current(numFlowCells);
2358 for(int j=0;j<numFlowCells;j++){
2359 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2362 for(int j=0;j<numUniques;j++){
2363 int offset = j * numFlowCells;
2367 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
2368 else { shorterLength = uniqueLengths[j]; }
2370 for(int k=0;k<shorterLength;k++){
2371 if(current[k] != uniqueFlowgrams[offset + k]){
2378 mapSeqToUnique[i] = j;
2381 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
2387 if(index == numUniques){
2388 uniqueLengths[numUniques] = lengths[i];
2389 uniqueCount[numUniques] = 1;
2390 mapSeqToUnique[i] = numUniques;//anMap
2391 mapUniqueToSeq[numUniques] = i;//anF
2393 for(int k=0;k<numFlowCells;k++){
2394 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2395 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2401 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2402 uniqueLengths.resize(numUniques);
2404 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2405 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
2409 catch(exception& e) {
2410 m->errorOut(e, "ShhherCommand", "getUniques");
2414 /**************************************************************************************************/
2415 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2418 vector<string> duplicateNames(numUniques, "");
2419 for(int i=0;i<numSeqs;i++){
2420 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2424 m->openOutputFile(filename, nameFile);
2426 for(int i=0;i<numUniques;i++){
2428 if (m->control_pressed) { break; }
2430 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2431 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2438 catch(exception& e) {
2439 m->errorOut(e, "ShhherCommand", "createNamesFile");
2443 //**********************************************************************************************************************
2445 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2448 ReadMatrix* read = new ReadColumnMatrix(distFileName);
2449 read->setCutoff(cutoff);
2451 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2452 clusterNameMap->readMap();
2453 read->read(clusterNameMap);
2455 ListVector* list = read->getListVector();
2456 SparseMatrix* matrix = read->getMatrix();
2459 delete clusterNameMap;
2461 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2463 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
2464 string tag = cluster->getTag();
2466 double clusterCutoff = cutoff;
2467 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2469 if (m->control_pressed) { break; }
2471 cluster->update(clusterCutoff);
2474 list->setLabel(toString(cutoff));
2477 m->openOutputFile(filename, listFile);
2478 list->print(listFile);
2481 delete matrix; delete cluster; delete rabund; delete list;
2485 catch(exception& e) {
2486 m->errorOut(e, "ShhherCommand", "cluster");
2490 /**************************************************************************************************/
2492 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
2493 vector<int>& cumNumSeqs,
2494 vector<int>& nSeqsPerOTU,
2495 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2496 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2497 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
2498 vector<int>& seqIndex,
2499 map<string, int>& nameMap){
2503 m->openInputFile(fileName, listFile);
2507 listFile >> label >> numOTUs;
2509 otuData.assign(numSeqs, 0);
2510 cumNumSeqs.assign(numOTUs, 0);
2511 nSeqsPerOTU.assign(numOTUs, 0);
2512 aaP.clear();aaP.resize(numOTUs);
2518 string singleOTU = "";
2520 for(int i=0;i<numOTUs;i++){
2522 if (m->control_pressed) { break; }
2524 listFile >> singleOTU;
2526 istringstream otuString(singleOTU);
2530 string seqName = "";
2532 for(int j=0;j<singleOTU.length();j++){
2533 char letter = otuString.get();
2539 map<string,int>::iterator nmIt = nameMap.find(seqName);
2540 int index = nmIt->second;
2542 nameMap.erase(nmIt);
2546 aaP[i].push_back(index);
2551 map<string,int>::iterator nmIt = nameMap.find(seqName);
2553 int index = nmIt->second;
2554 nameMap.erase(nmIt);
2558 aaP[i].push_back(index);
2563 sort(aaP[i].begin(), aaP[i].end());
2564 for(int j=0;j<nSeqsPerOTU[i];j++){
2565 seqNumber.push_back(aaP[i][j]);
2567 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2568 aaP[i].push_back(0);
2574 for(int i=1;i<numOTUs;i++){
2575 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2578 seqIndex = seqNumber;
2585 catch(exception& e) {
2586 m->errorOut(e, "ShhherCommand", "getOTUData");
2590 /**************************************************************************************************/
2592 int ShhherCommand::calcCentroidsDriver(int numOTUs,
2593 vector<int>& cumNumSeqs,
2594 vector<int>& nSeqsPerOTU,
2595 vector<int>& seqIndex,
2596 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
2597 vector<int>& centroids, //the representative flowgram for each cluster m
2598 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2599 vector<int>& mapSeqToUnique,
2600 vector<short>& uniqueFlowgrams,
2601 vector<short>& flowDataIntI,
2602 vector<int>& lengths,
2604 vector<int>& seqNumber){
2606 //this function gets the most likely homopolymer length at a flow position for a group of sequences
2611 for(int i=0;i<numOTUs;i++){
2613 if (m->control_pressed) { break; }
2617 int minFlowGram = 100000000;
2618 double minFlowValue = 1e8;
2619 change[i] = 0; //FALSE
2621 for(int j=0;j<nSeqsPerOTU[i];j++){
2622 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2625 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2626 vector<double> adF(nSeqsPerOTU[i]);
2627 vector<int> anL(nSeqsPerOTU[i]);
2629 for(int j=0;j<nSeqsPerOTU[i];j++){
2630 int index = cumNumSeqs[i] + j;
2631 int nI = seqIndex[index];
2632 int nIU = mapSeqToUnique[nI];
2635 for(k=0;k<position;k++){
2641 anL[position] = nIU;
2642 adF[position] = 0.0000;
2647 for(int j=0;j<nSeqsPerOTU[i];j++){
2648 int index = cumNumSeqs[i] + j;
2649 int nI = seqIndex[index];
2651 double tauValue = singleTau[seqNumber[index]];
2653 for(int k=0;k<position;k++){
2654 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2655 adF[k] += dist * tauValue;
2659 for(int j=0;j<position;j++){
2660 if(adF[j] < minFlowValue){
2662 minFlowValue = adF[j];
2666 if(centroids[i] != anL[minFlowGram]){
2668 centroids[i] = anL[minFlowGram];
2671 else if(centroids[i] != -1){
2679 catch(exception& e) {
2680 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2684 /**************************************************************************************************/
2686 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2687 vector<short>& flowDataIntI, int numFlowCells){
2690 int flowAValue = cent * numFlowCells;
2691 int flowBValue = flow * numFlowCells;
2695 for(int i=0;i<length;i++){
2696 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2701 return dist / (double)length;
2703 catch(exception& e) {
2704 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2708 /**************************************************************************************************/
2710 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2713 double maxChange = 0;
2715 for(int i=0;i<numOTUs;i++){
2717 if (m->control_pressed) { break; }
2719 double difference = weight[i];
2722 for(int j=0;j<nSeqsPerOTU[i];j++){
2723 int index = cumNumSeqs[i] + j;
2724 double tauValue = singleTau[seqNumber[index]];
2725 weight[i] += tauValue;
2728 difference = fabs(weight[i] - difference);
2729 if(difference > maxChange){ maxChange = difference; }
2733 catch(exception& e) {
2734 m->errorOut(e, "ShhherCommand", "getNewWeights");
2739 /**************************************************************************************************/
2741 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2745 vector<long double> P(numSeqs, 0);
2748 for(int i=0;i<numOTUs;i++){
2749 if(weight[i] > MIN_WEIGHT){
2755 for(int i=0;i<numOTUs;i++){
2757 if (m->control_pressed) { break; }
2759 for(int j=0;j<nSeqsPerOTU[i];j++){
2760 int index = cumNumSeqs[i] + j;
2761 int nI = seqIndex[index];
2762 double singleDist = dist[seqNumber[index]];
2764 P[nI] += weight[i] * exp(-singleDist * sigma);
2768 for(int i=0;i<numSeqs;i++){
2769 if(P[i] == 0){ P[i] = DBL_EPSILON; }
2774 nLL = nLL -(double)numSeqs * log(sigma);
2778 catch(exception& e) {
2779 m->errorOut(e, "ShhherCommand", "getNewWeights");
2784 /**************************************************************************************************/
2786 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2788 vector<int> unique(numOTUs, 1);
2790 for(int i=0;i<numOTUs;i++){
2791 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2796 for(int i=0;i<numOTUs;i++){
2798 if (m->control_pressed) { break; }
2801 for(int j=i+1;j<numOTUs;j++){
2804 if(centroids[j] == centroids[i]){
2808 weight[i] += weight[j];
2818 catch(exception& e) {
2819 m->errorOut(e, "ShhherCommand", "checkCentroids");
2823 /**************************************************************************************************/
2825 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
2826 vector<double>& weight, vector<short>& change, vector<int>& centroids,
2827 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
2828 vector<int>& seqNumber, vector<int>& seqIndex,
2829 vector<short>& uniqueFlowgrams,
2830 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2835 vector<double> newTau(numOTUs,0);
2836 vector<double> norms(numSeqs, 0);
2837 nSeqsPerOTU.assign(numOTUs, 0);
2839 for(int i=0;i<numSeqs;i++){
2841 if (m->control_pressed) { break; }
2843 int indexOffset = i * numOTUs;
2845 double offset = 1e8;
2847 for(int j=0;j<numOTUs;j++){
2849 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2850 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2853 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2854 offset = dist[indexOffset + j];
2858 for(int j=0;j<numOTUs;j++){
2859 if(weight[j] > MIN_WEIGHT){
2860 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2861 norms[i] += newTau[j];
2868 for(int j=0;j<numOTUs;j++){
2869 newTau[j] /= norms[i];
2872 for(int j=0;j<numOTUs;j++){
2873 if(newTau[j] > MIN_TAU){
2875 int oldTotal = total;
2879 singleTau.resize(total, 0);
2880 seqNumber.resize(total, 0);
2881 seqIndex.resize(total, 0);
2883 singleTau[oldTotal] = newTau[j];
2885 aaP[j][nSeqsPerOTU[j]] = oldTotal;
2886 aaI[j][nSeqsPerOTU[j]] = i;
2894 catch(exception& e) {
2895 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2899 /**************************************************************************************************/
2901 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2904 for(int i=0;i<numOTUs;i++){
2906 if (m->control_pressed) { return 0; }
2908 cumNumSeqs[i] = index;
2909 for(int j=0;j<nSeqsPerOTU[i];j++){
2910 seqNumber[index] = aaP[i][j];
2911 seqIndex[index] = aaI[i][j];
2919 catch(exception& e) {
2920 m->errorOut(e, "ShhherCommand", "fill");
2924 /**************************************************************************************************/
2926 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2927 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2930 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2932 for(int i=0;i<numOTUs;i++){
2934 if (m->control_pressed) { break; }
2936 for(int j=0;j<nSeqsPerOTU[i];j++){
2937 int index = cumNumSeqs[i] + j;
2938 double tauValue = singleTau[seqNumber[index]];
2939 int sIndex = seqIndex[index];
2940 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
2944 for(int i=0;i<numSeqs;i++){
2945 double maxTau = -1.0000;
2947 for(int j=0;j<numOTUs;j++){
2948 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2949 maxTau = bigTauMatrix[i * numOTUs + j];
2954 otuData[i] = maxOTU;
2957 nSeqsPerOTU.assign(numOTUs, 0);
2959 for(int i=0;i<numSeqs;i++){
2960 int index = otuData[i];
2962 singleTau[i] = 1.0000;
2965 aaP[index][nSeqsPerOTU[index]] = i;
2966 aaI[index][nSeqsPerOTU[index]] = i;
2968 nSeqsPerOTU[index]++;
2971 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2973 catch(exception& e) {
2974 m->errorOut(e, "ShhherCommand", "setOTUs");
2978 /**************************************************************************************************/
2980 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2981 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2982 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2985 string thisOutputDir = outputDir;
2986 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
2987 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2989 ofstream qualityFile;
2990 m->openOutputFile(qualityFileName, qualityFile);
2992 qualityFile.setf(ios::fixed, ios::floatfield);
2993 qualityFile.setf(ios::showpoint);
2994 qualityFile << setprecision(6);
2996 vector<vector<int> > qualities(numOTUs);
2997 vector<double> pr(HOMOPS, 0);
3000 for(int i=0;i<numOTUs;i++){
3002 if (m->control_pressed) { break; }
3007 if(nSeqsPerOTU[i] > 0){
3008 qualities[i].assign(1024, -1);
3010 while(index < numFlowCells){
3011 double maxPrValue = 1e8;
3012 short maxPrIndex = -1;
3013 double count = 0.0000;
3015 pr.assign(HOMOPS, 0);
3017 for(int j=0;j<nSeqsPerOTU[i];j++){
3018 int lIndex = cumNumSeqs[i] + j;
3019 double tauValue = singleTau[seqNumber[lIndex]];
3020 int sequenceIndex = aaI[i][j];
3021 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3025 for(int s=0;s<HOMOPS;s++){
3026 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3030 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3031 maxPrValue = pr[maxPrIndex];
3033 if(count > MIN_COUNT){
3035 double norm = 0.0000;
3037 for(int s=0;s<HOMOPS;s++){
3038 norm += exp(-(pr[s] - maxPrValue));
3041 for(int s=1;s<=maxPrIndex;s++){
3043 double temp = 0.0000;
3045 U += exp(-(pr[s-1]-maxPrValue))/norm;
3053 temp = floor(-10 * temp);
3054 value = (int)floor(temp);
3055 if(value > 100){ value = 100; }
3057 qualities[i][base] = (int)value;
3067 if(otuCounts[i] > 0){
3068 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3070 int j=4; //need to get past the first four bases
3071 while(qualities[i][j] != -1){
3072 qualityFile << qualities[i][j] << ' ';
3073 if (j > qualities[i].size()) { break; }
3076 qualityFile << endl;
3079 qualityFile.close();
3080 outputNames.push_back(qualityFileName);
3083 catch(exception& e) {
3084 m->errorOut(e, "ShhherCommand", "writeQualities");
3089 /**************************************************************************************************/
3091 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3093 string thisOutputDir = outputDir;
3094 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3095 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3097 m->openOutputFile(fastaFileName, fastaFile);
3099 vector<string> names(numOTUs, "");
3101 for(int i=0;i<numOTUs;i++){
3103 if (m->control_pressed) { break; }
3105 int index = centroids[i];
3107 if(otuCounts[i] > 0){
3108 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3112 for(int j=0;j<numFlowCells;j++){
3114 char base = flowOrder[j % 4];
3115 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3120 fastaFile << newSeq.substr(4) << endl;
3125 outputNames.push_back(fastaFileName);
3127 if(thisCompositeFASTAFileName != ""){
3128 m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3131 catch(exception& e) {
3132 m->errorOut(e, "ShhherCommand", "writeSequences");
3137 /**************************************************************************************************/
3139 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3141 string thisOutputDir = outputDir;
3142 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3143 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3145 m->openOutputFile(nameFileName, nameFile);
3147 for(int i=0;i<numOTUs;i++){
3149 if (m->control_pressed) { break; }
3151 if(otuCounts[i] > 0){
3152 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3154 for(int j=1;j<nSeqsPerOTU[i];j++){
3155 nameFile << ',' << seqNameVector[aaI[i][j]];
3162 outputNames.push_back(nameFileName);
3165 if(thisCompositeNamesFileName != ""){
3166 m->appendFiles(nameFileName, thisCompositeNamesFileName);
3169 catch(exception& e) {
3170 m->errorOut(e, "ShhherCommand", "writeNames");
3175 /**************************************************************************************************/
3177 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3179 string thisOutputDir = outputDir;
3180 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3181 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3182 string groupFileName = fileRoot + "shhh.groups";
3184 m->openOutputFile(groupFileName, groupFile);
3186 for(int i=0;i<numSeqs;i++){
3187 if (m->control_pressed) { break; }
3188 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3191 outputNames.push_back(groupFileName);
3194 catch(exception& e) {
3195 m->errorOut(e, "ShhherCommand", "writeGroups");
3200 /**************************************************************************************************/
3202 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3204 string thisOutputDir = outputDir;
3205 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3206 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3207 ofstream otuCountsFile;
3208 m->openOutputFile(otuCountsFileName, otuCountsFile);
3210 string bases = flowOrder;
3212 for(int i=0;i<numOTUs;i++){
3214 if (m->control_pressed) {
3217 //output the translated version of the centroid sequence for the otu
3218 if(otuCounts[i] > 0){
3219 int index = centroids[i];
3221 otuCountsFile << "ideal\t";
3222 for(int j=8;j<numFlowCells;j++){
3223 char base = bases[j % 4];
3224 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3225 otuCountsFile << base;
3228 otuCountsFile << endl;
3230 for(int j=0;j<nSeqsPerOTU[i];j++){
3231 int sequence = aaI[i][j];
3232 otuCountsFile << seqNameVector[sequence] << '\t';
3236 for(int k=0;k<lengths[sequence];k++){
3237 char base = bases[k % 4];
3238 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3240 for(int s=0;s<freq;s++){
3242 //otuCountsFile << base;
3245 otuCountsFile << newSeq.substr(4) << endl;
3247 otuCountsFile << endl;
3250 otuCountsFile.close();
3251 outputNames.push_back(otuCountsFileName);
3254 catch(exception& e) {
3255 m->errorOut(e, "ShhherCommand", "writeClusters");
3260 /**************************************************************************************************/
3262 void ShhherCommand::getSingleLookUp(){
3264 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
3265 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3268 ifstream lookUpFile;
3269 m->openInputFile(lookupFileName, lookUpFile);
3271 for(int i=0;i<HOMOPS;i++){
3273 if (m->control_pressed) { break; }
3276 lookUpFile >> logFracFreq;
3278 for(int j=0;j<NUMBINS;j++) {
3279 lookUpFile >> singleLookUp[index];
3285 catch(exception& e) {
3286 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3291 /**************************************************************************************************/
3293 void ShhherCommand::getJointLookUp(){
3296 // the most likely joint probability (-log) that two intenities have the same polymer length
3297 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3299 for(int i=0;i<NUMBINS;i++){
3301 if (m->control_pressed) { break; }
3303 for(int j=0;j<NUMBINS;j++){
3305 double minSum = 100000000;
3307 for(int k=0;k<HOMOPS;k++){
3308 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3310 if(sum < minSum) { minSum = sum; }
3312 jointLookUp[i * NUMBINS + j] = minSum;
3316 catch(exception& e) {
3317 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3322 /**************************************************************************************************/
3324 double ShhherCommand::getProbIntensity(int intIntensity){
3327 double minNegLogProb = 100000000;
3330 for(int i=0;i<HOMOPS;i++){//loop signal strength
3332 if (m->control_pressed) { break; }
3334 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3335 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
3338 return minNegLogProb;
3340 catch(exception& e) {
3341 m->errorOut(e, "ShhherCommand", "getProbIntensity");