5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "shhhercommand.h"
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27 vector<string> myArray;
28 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
32 m->errorOut(e, "ShhherCommand", "setParameters");
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){
39 string helpString = "";
40 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
44 m->errorOut(e, "ShhherCommand", "getHelpString");
48 //**********************************************************************************************************************
50 ShhherCommand::ShhherCommand(){
52 abort = true; calledHelp = true;
55 //initialize outputTypes
56 // vector<string> tempOutNames;
57 // outputTypes["pn.dist"] = tempOutNames;
61 m->errorOut(e, "ShhherCommand", "ShhherCommand");
66 //**********************************************************************************************************************
68 ShhherCommand::ShhherCommand(string option) {
72 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
77 abort = false; calledHelp = false;
79 //allow user to run help
80 if(option == "help") { help(); abort = true; calledHelp = true; }
81 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
84 vector<string> myArray = setParameters();
86 OptionParser parser(option);
87 map<string,string> parameters = parser.getParameters();
89 ValidParameters validParameter;
90 map<string,string>::iterator it;
92 //check to make sure all parameters are valid for command
93 for (it = parameters.begin(); it != parameters.end(); it++) {
94 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
97 //initialize outputTypes
98 vector<string> tempOutNames;
99 // outputTypes["pn.dist"] = tempOutNames;
100 // outputTypes["fasta"] = tempOutNames;
102 //if the user changes the input directory command factory will send this info to us in the output parameter
103 string inputDir = validParameter.validFile(parameters, "inputdir", false);
104 if (inputDir == "not found"){ inputDir = ""; }
107 it = parameters.find("flow");
108 //user has given a template file
109 if(it != parameters.end()){
110 path = m->hasPath(it->second);
111 //if the user has not given a path then, add inputdir. else leave path alone.
112 if (path == "") { parameters["flow"] = inputDir + it->second; }
115 it = parameters.find("lookup");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["lookup"] = inputDir + it->second; }
123 it = parameters.find("file");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["file"] = inputDir + it->second; }
133 //check for required parameters
134 flowFileName = validParameter.validFile(parameters, "flow", true);
135 flowFilesFileName = validParameter.validFile(parameters, "file", true);
136 if (flowFileName == "not found" && flowFilesFileName == "not found") {
137 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
138 m->mothurOutEndLine();
141 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
143 if(flowFileName != "not found"){
144 compositeFASTAFileName = "";
145 compositeNamesFileName = "";
150 //flow.files = 9 character offset
151 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.fasta";
152 m->openOutputFile(compositeFASTAFileName, temp);
155 compositeNamesFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "shhh.names";
156 m->openOutputFile(compositeNamesFileName, temp);
160 if(flowFilesFileName != "not found"){
163 ifstream flowFilesFile;
164 m->openInputFile(flowFilesFileName, flowFilesFile);
165 while(flowFilesFile){
166 fName = m->getline(flowFilesFile);
168 //test if file is valid
170 int ableToOpen = m->openInputFile(fName, in, "noerror");
172 if (ableToOpen == 1) {
173 if (inputDir != "") { //default path is set
174 string tryPath = inputDir + fName;
175 m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
177 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
183 if (ableToOpen == 1) {
184 if (m->getDefaultPath() != "") { //default path is set
185 string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
186 m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
188 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
194 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
195 if (ableToOpen == 1) {
196 string exepath = m->argv;
197 string tempPath = exepath;
198 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
199 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
201 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
202 m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
204 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
209 if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
210 else { flowFileVector.push_back(fName); }
211 m->gobble(flowFilesFile);
213 flowFilesFile.close();
214 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
217 flowFileVector.push_back(flowFileName);
221 //if the user changes the output directory command factory will send this info to us in the output parameter
222 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
224 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it
228 //check for optional parameter and set defaults
229 // ...at some point should added some additional type checking...
231 temp = validParameter.validFile(parameters, "lookup", true);
232 if (temp == "not found") {
233 lookupFileName = "LookUp_Titanium.pat";
237 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
240 //if you can't open it, try input location
241 if (ableToOpen == 1) {
242 if (inputDir != "") { //default path is set
243 string tryPath = inputDir + lookupFileName;
244 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
246 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248 lookupFileName = tryPath;
252 //if you can't open it, try default location
253 if (ableToOpen == 1) {
254 if (m->getDefaultPath() != "") { //default path is set
255 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
256 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
258 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
260 lookupFileName = tryPath;
264 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
265 if (ableToOpen == 1) {
266 string exepath = m->argv;
267 string tempPath = exepath;
268 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
269 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
271 string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
272 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
274 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
276 lookupFileName = tryPath;
279 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
281 else if(temp == "not open") {
283 lookupFileName = validParameter.validFile(parameters, "lookup", false);
285 //if you can't open it its not inputDir, try mothur excutable location
286 string exepath = m->argv;
287 string tempPath = exepath;
288 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
289 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
291 string tryPath = m->getFullPathName(exepath) + lookupFileName;
292 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
294 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
296 lookupFileName = tryPath;
298 if (ableToOpen == 1) { m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true; }
299 }else { lookupFileName = temp; }
301 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
302 m->setProcessors(temp);
303 m->mothurConvert(temp, processors);
305 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0.01"; }
306 m->mothurConvert(temp, cutoff);
308 temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){ temp = "0.000001"; }
309 m->mothurConvert(temp, minDelta);
311 temp = validParameter.validFile(parameters, "maxiter", false); if (temp == "not found"){ temp = "1000"; }
312 m->mothurConvert(temp, maxIters);
314 temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found") { temp = "60"; }
315 m->mothurConvert(temp, sigma);
317 flowOrder = validParameter.validFile(parameters, "order", false);
318 if (flowOrder == "not found"){ flowOrder = "TACG"; }
319 else if(flowOrder.length() != 4){
320 m->mothurOut("The value of the order option must be four bases long\n");
328 catch(exception& e) {
329 m->errorOut(e, "ShhherCommand", "ShhherCommand");
333 //**********************************************************************************************************************
335 int ShhherCommand::execute(){
337 if (abort == true) { if (calledHelp) { return 0; } return 2; }
344 for(int i=1;i<ncpus;i++){
345 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
347 if(abort == 1){ return 0; }
351 m->mothurOut("\nGetting preliminary data...\n");
352 getSingleLookUp(); if (m->control_pressed) { return 0; }
353 getJointLookUp(); if (m->control_pressed) { return 0; }
355 vector<string> flowFileVector;
356 if(flowFilesFileName != "not found"){
359 ifstream flowFilesFile;
360 m->openInputFile(flowFilesFileName, flowFilesFile);
361 while(flowFilesFile){
362 fName = m->getline(flowFilesFile);
363 flowFileVector.push_back(fName);
364 m->gobble(flowFilesFile);
368 flowFileVector.push_back(flowFileName);
371 int numFiles = flowFileVector.size();
373 for(int i=1;i<ncpus;i++){
374 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
377 for(int i=0;i<numFiles;i++){
379 if (m->control_pressed) { break; }
381 double begClock = clock();
382 unsigned long long begTime = time(NULL);
384 flowFileName = flowFileVector[i];
386 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
387 m->mothurOut("Reading flowgrams...\n");
390 if (m->control_pressed) { break; }
392 m->mothurOut("Identifying unique flowgrams...\n");
395 if (m->control_pressed) { break; }
397 m->mothurOut("Calculating distances between flowgrams...\n");
399 strcpy(fileName, flowFileName.c_str());
401 for(int i=1;i<ncpus;i++){
402 MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
404 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
405 MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
406 MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
407 MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
408 MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
409 MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
410 MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
411 MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
412 MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
413 MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
416 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
418 if (m->control_pressed) { break; }
421 for(int i=1;i<ncpus;i++){
422 MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
424 m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
425 m->mothurRemove((distFileName + ".temp." + toString(i)));
428 string namesFileName = createNamesFile();
430 if (m->control_pressed) { break; }
432 m->mothurOut("\nClustering flowgrams...\n");
433 string listFileName = cluster(distFileName, namesFileName);
435 if (m->control_pressed) { break; }
439 getOTUData(listFileName);
441 m->mothurRemove(distFileName);
442 m->mothurRemove(namesFileName);
443 m->mothurRemove(listFileName);
445 if (m->control_pressed) { break; }
449 if (m->control_pressed) { break; }
452 for(int i=1;i<ncpus;i++){
453 MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454 MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
455 MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
456 MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
459 if (m->control_pressed) { break; }
464 int numOTUsOnCPU = numOTUs / ncpus;
465 int numSeqsOnCPU = numSeqs / ncpus;
466 m->mothurOut("\nDenoising flowgrams...\n");
467 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
469 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
471 double cycClock = clock();
472 unsigned long long cycTime = time(NULL);
475 if (m->control_pressed) { break; }
477 int total = singleTau.size();
478 for(int i=1;i<ncpus;i++){
479 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
480 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
481 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
483 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
484 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
485 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
486 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
487 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
490 calcCentroidsDriver(0, numOTUsOnCPU);
492 for(int i=1;i<ncpus;i++){
493 int otuStart = i * numOTUs / ncpus;
494 int otuStop = (i + 1) * numOTUs / ncpus;
496 vector<int> tempCentroids(numOTUs, 0);
497 vector<short> tempChange(numOTUs, 0);
499 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
500 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
502 for(int j=otuStart;j<otuStop;j++){
503 centroids[j] = tempCentroids[j];
504 change[j] = tempChange[j];
508 maxDelta = getNewWeights(); if (m->control_pressed) { break; }
509 double nLL = getLikelihood(); if (m->control_pressed) { break; }
510 checkCentroids(); if (m->control_pressed) { break; }
512 for(int i=1;i<ncpus;i++){
513 MPI_Send(¢roids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
514 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
515 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
518 calcNewDistancesParent(0, numSeqsOnCPU);
520 total = singleTau.size();
522 for(int i=1;i<ncpus;i++){
524 int seqStart = i * numSeqs / ncpus;
525 int seqStop = (i + 1) * numSeqs / ncpus;
527 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
529 vector<int> childSeqIndex(childTotal, 0);
530 vector<double> childSingleTau(childTotal, 0);
531 vector<double> childDist(numSeqs * numOTUs, 0);
532 vector<int> otuIndex(childTotal, 0);
534 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
535 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
536 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
537 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
539 int oldTotal = total;
541 singleTau.resize(total, 0);
542 seqIndex.resize(total, 0);
543 seqNumber.resize(total, 0);
547 for(int j=oldTotal;j<total;j++){
548 int otuI = otuIndex[childIndex];
549 int seqI = childSeqIndex[childIndex];
551 singleTau[j] = childSingleTau[childIndex];
553 aaP[otuI][nSeqsPerOTU[otuI]] = j;
554 aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
559 int index = seqStart * numOTUs;
560 for(int j=seqStart;j<seqStop;j++){
561 for(int k=0;k<numOTUs;k++){
562 dist[index] = childDist[index];
570 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
572 if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
574 for(int i=1;i<ncpus;i++){
575 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
580 for(int i=1;i<ncpus;i++){
581 MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
587 if (m->control_pressed) { break; }
589 m->mothurOut("\nFinalizing...\n");
592 if (m->control_pressed) { break; }
596 vector<int> otuCounts(numOTUs, 0);
597 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
598 calcCentroidsDriver(0, numOTUs);
600 if (m->control_pressed) { break; }
602 writeQualities(otuCounts); if (m->control_pressed) { break; }
603 writeSequences(otuCounts); if (m->control_pressed) { break; }
604 writeNames(otuCounts); if (m->control_pressed) { break; }
605 writeClusters(otuCounts); if (m->control_pressed) { break; }
606 writeGroups(); if (m->control_pressed) { break; }
609 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
615 MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
616 if(abort){ return 0; }
619 MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
621 for(int i=0;i<numFiles;i++){
623 if (m->control_pressed) { break; }
625 //Now into the pyrodist part
629 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
630 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
631 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
632 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
634 flowDataIntI.resize(numSeqs * numFlowCells);
635 flowDataPrI.resize(numSeqs * numFlowCells);
636 mapUniqueToSeq.resize(numSeqs);
637 mapSeqToUnique.resize(numSeqs);
638 lengths.resize(numSeqs);
639 jointLookUp.resize(NUMBINS * NUMBINS);
641 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
642 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
643 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
644 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
645 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
646 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
647 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
649 flowFileName = string(fileName);
650 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
651 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
653 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
655 if (m->control_pressed) { break; }
658 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
660 //Now into the pyronoise part
661 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
663 singleLookUp.resize(HOMOPS * NUMBINS);
664 uniqueFlowgrams.resize(numUniques * numFlowCells);
665 weight.resize(numOTUs);
666 centroids.resize(numOTUs);
667 change.resize(numOTUs);
668 dist.assign(numOTUs * numSeqs, 0);
669 nSeqsPerOTU.resize(numOTUs);
670 cumNumSeqs.resize(numOTUs);
672 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
673 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
674 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
676 int startOTU = pid * numOTUs / ncpus;
677 int endOTU = (pid + 1) * numOTUs / ncpus;
679 int startSeq = pid * numSeqs / ncpus;
680 int endSeq = (pid + 1) * numSeqs /ncpus;
686 if (m->control_pressed) { break; }
688 MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
689 singleTau.assign(total, 0.0000);
690 seqNumber.assign(total, 0);
691 seqIndex.assign(total, 0);
693 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
694 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
695 MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
696 MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
697 MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698 MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699 MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
701 calcCentroidsDriver(startOTU, endOTU);
703 MPI_Send(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
704 MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
706 MPI_Recv(¢roids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
707 MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
708 MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
710 vector<int> otuIndex(total, 0);
711 calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
712 total = otuIndex.size();
714 MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
715 MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
716 MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
717 MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
718 MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
720 MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
725 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
727 MPI_Barrier(MPI_COMM_WORLD);
730 if(compositeFASTAFileName != ""){
731 outputNames.push_back(compositeFASTAFileName);
732 outputNames.push_back(compositeNamesFileName);
735 m->mothurOutEndLine();
736 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
737 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
738 m->mothurOutEndLine();
743 catch(exception& e) {
744 m->errorOut(e, "ShhherCommand", "execute");
748 /**************************************************************************************************/
749 string ShhherCommand::createNamesFile(){
752 vector<string> duplicateNames(numUniques, "");
753 for(int i=0;i<numSeqs;i++){
754 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
757 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
760 m->openOutputFile(nameFileName, nameFile);
762 for(int i=0;i<numUniques;i++){
764 if (m->control_pressed) { break; }
766 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
767 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
773 catch(exception& e) {
774 m->errorOut(e, "ShhherCommand", "createNamesFile");
778 /**************************************************************************************************/
780 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
782 ostringstream outStream;
783 outStream.setf(ios::fixed, ios::floatfield);
784 outStream.setf(ios::dec, ios::basefield);
785 outStream.setf(ios::showpoint);
786 outStream.precision(6);
788 int begTime = time(NULL);
789 double begClock = clock();
791 for(int i=startSeq;i<stopSeq;i++){
793 if (m->control_pressed) { break; }
795 for(int j=0;j<i;j++){
796 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
798 if(flowDistance < 1e-6){
799 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
801 else if(flowDistance <= cutoff){
802 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
806 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
810 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
811 if(pid != 0){ fDistFileName += ".temp." + toString(pid); }
813 if (m->control_pressed) { return fDistFileName; }
815 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
817 ofstream distFile(fDistFileName.c_str());
818 distFile << outStream.str();
821 return fDistFileName;
823 catch(exception& e) {
824 m->errorOut(e, "ShhherCommand", "flowDistMPI");
828 /**************************************************************************************************/
830 void ShhherCommand::getOTUData(string listFileName){
834 m->openInputFile(listFileName, listFile);
837 listFile >> label >> numOTUs;
839 otuData.assign(numSeqs, 0);
840 cumNumSeqs.assign(numOTUs, 0);
841 nSeqsPerOTU.assign(numOTUs, 0);
842 aaP.clear();aaP.resize(numOTUs);
848 string singleOTU = "";
850 for(int i=0;i<numOTUs;i++){
852 if (m->control_pressed) { break; }
854 listFile >> singleOTU;
856 istringstream otuString(singleOTU);
862 for(int j=0;j<singleOTU.length();j++){
863 char letter = otuString.get();
869 map<string,int>::iterator nmIt = nameMap.find(seqName);
870 int index = nmIt->second;
876 aaP[i].push_back(index);
881 map<string,int>::iterator nmIt = nameMap.find(seqName);
883 int index = nmIt->second;
888 aaP[i].push_back(index);
893 sort(aaP[i].begin(), aaP[i].end());
894 for(int j=0;j<nSeqsPerOTU[i];j++){
895 seqNumber.push_back(aaP[i][j]);
897 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
904 for(int i=1;i<numOTUs;i++){
905 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
908 seqIndex = seqNumber;
913 catch(exception& e) {
914 m->errorOut(e, "ShhherCommand", "getOTUData");
919 /**************************************************************************************************/
921 void ShhherCommand::initPyroCluster(){
923 if (numOTUs < processors) { processors = 1; }
925 dist.assign(numSeqs * numOTUs, 0);
926 change.assign(numOTUs, 1);
927 centroids.assign(numOTUs, -1);
928 weight.assign(numOTUs, 0);
929 singleTau.assign(numSeqs, 1.0);
931 nSeqsBreaks.assign(processors+1, 0);
932 nOTUsBreaks.assign(processors+1, 0);
935 for(int i=0;i<processors;i++){
936 nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
937 nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
939 nSeqsBreaks[processors] = numSeqs;
940 nOTUsBreaks[processors] = numOTUs;
942 catch(exception& e) {
943 m->errorOut(e, "ShhherCommand", "initPyroCluster");
948 /**************************************************************************************************/
950 void ShhherCommand::fill(){
953 for(int i=0;i<numOTUs;i++){
955 if (m->control_pressed) { break; }
957 cumNumSeqs[i] = index;
958 for(int j=0;j<nSeqsPerOTU[i];j++){
959 seqNumber[index] = aaP[i][j];
960 seqIndex[index] = aaI[i][j];
966 catch(exception& e) {
967 m->errorOut(e, "ShhherCommand", "fill");
972 /**************************************************************************************************/
974 void ShhherCommand::getFlowData(){
977 m->openInputFile(flowFileName, flowFile);
980 seqNameVector.clear();
982 flowDataIntI.clear();
986 int currentNumFlowCells;
990 flowFile >> numFlowCells;
991 int index = 0;//pcluster
992 while(!flowFile.eof()){
994 if (m->control_pressed) { break; }
996 flowFile >> seqName >> currentNumFlowCells;
997 lengths.push_back(currentNumFlowCells);
999 seqNameVector.push_back(seqName);
1000 nameMap[seqName] = index++;//pcluster
1002 for(int i=0;i<numFlowCells;i++){
1003 flowFile >> intensity;
1004 if(intensity > 9.99) { intensity = 9.99; }
1005 int intI = int(100 * intensity + 0.0001);
1006 flowDataIntI.push_back(intI);
1008 m->gobble(flowFile);
1012 numSeqs = seqNameVector.size();
1014 for(int i=0;i<numSeqs;i++){
1016 if (m->control_pressed) { break; }
1018 int iNumFlowCells = i * numFlowCells;
1019 for(int j=lengths[i];j<numFlowCells;j++){
1020 flowDataIntI[iNumFlowCells + j] = 0;
1025 catch(exception& e) {
1026 m->errorOut(e, "ShhherCommand", "getFlowData");
1030 /**************************************************************************************************/
1031 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1034 vector<double> newTau(numOTUs,0);
1035 vector<double> norms(numSeqs, 0);
1040 for(int i=startSeq;i<stopSeq;i++){
1042 if (m->control_pressed) { break; }
1044 double offset = 1e8;
1045 int indexOffset = i * numOTUs;
1047 for(int j=0;j<numOTUs;j++){
1049 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1050 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1052 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1053 offset = dist[indexOffset + j];
1057 for(int j=0;j<numOTUs;j++){
1058 if(weight[j] > MIN_WEIGHT){
1059 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1060 norms[i] += newTau[j];
1067 for(int j=0;j<numOTUs;j++){
1069 newTau[j] /= norms[i];
1071 if(newTau[j] > MIN_TAU){
1072 otuIndex.push_back(j);
1073 seqIndex.push_back(i);
1074 singleTau.push_back(newTau[j]);
1080 catch(exception& e) {
1081 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1086 /**************************************************************************************************/
1088 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1093 vector<double> newTau(numOTUs,0);
1094 vector<double> norms(numSeqs, 0);
1095 nSeqsPerOTU.assign(numOTUs, 0);
1097 for(int i=startSeq;i<stopSeq;i++){
1099 if (m->control_pressed) { break; }
1101 int indexOffset = i * numOTUs;
1103 double offset = 1e8;
1105 for(int j=0;j<numOTUs;j++){
1107 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1108 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1111 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1112 offset = dist[indexOffset + j];
1116 for(int j=0;j<numOTUs;j++){
1117 if(weight[j] > MIN_WEIGHT){
1118 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1119 norms[i] += newTau[j];
1126 for(int j=0;j<numOTUs;j++){
1127 newTau[j] /= norms[i];
1130 for(int j=0;j<numOTUs;j++){
1131 if(newTau[j] > MIN_TAU){
1133 int oldTotal = total;
1137 singleTau.resize(total, 0);
1138 seqNumber.resize(total, 0);
1139 seqIndex.resize(total, 0);
1141 singleTau[oldTotal] = newTau[j];
1143 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1144 aaI[j][nSeqsPerOTU[j]] = i;
1152 catch(exception& e) {
1153 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1158 /**************************************************************************************************/
1160 void ShhherCommand::setOTUs(){
1163 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1165 for(int i=0;i<numOTUs;i++){
1167 if (m->control_pressed) { break; }
1169 for(int j=0;j<nSeqsPerOTU[i];j++){
1170 int index = cumNumSeqs[i] + j;
1171 double tauValue = singleTau[seqNumber[index]];
1172 int sIndex = seqIndex[index];
1173 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1177 for(int i=0;i<numSeqs;i++){
1178 double maxTau = -1.0000;
1180 for(int j=0;j<numOTUs;j++){
1181 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1182 maxTau = bigTauMatrix[i * numOTUs + j];
1187 otuData[i] = maxOTU;
1190 nSeqsPerOTU.assign(numOTUs, 0);
1192 for(int i=0;i<numSeqs;i++){
1193 int index = otuData[i];
1195 singleTau[i] = 1.0000;
1198 aaP[index][nSeqsPerOTU[index]] = i;
1199 aaI[index][nSeqsPerOTU[index]] = i;
1201 nSeqsPerOTU[index]++;
1205 catch(exception& e) {
1206 m->errorOut(e, "ShhherCommand", "setOTUs");
1211 /**************************************************************************************************/
1213 void ShhherCommand::getUniques(){
1218 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1219 uniqueCount.assign(numSeqs, 0); // anWeights
1220 uniqueLengths.assign(numSeqs, 0);
1221 mapSeqToUnique.assign(numSeqs, -1);
1222 mapUniqueToSeq.assign(numSeqs, -1);
1224 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1226 for(int i=0;i<numSeqs;i++){
1228 if (m->control_pressed) { break; }
1232 vector<short> current(numFlowCells);
1233 for(int j=0;j<numFlowCells;j++){
1234 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1237 for(int j=0;j<numUniques;j++){
1238 int offset = j * numFlowCells;
1242 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
1243 else { shorterLength = uniqueLengths[j]; }
1245 for(int k=0;k<shorterLength;k++){
1246 if(current[k] != uniqueFlowgrams[offset + k]){
1253 mapSeqToUnique[i] = j;
1256 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
1262 if(index == numUniques){
1263 uniqueLengths[numUniques] = lengths[i];
1264 uniqueCount[numUniques] = 1;
1265 mapSeqToUnique[i] = numUniques;//anMap
1266 mapUniqueToSeq[numUniques] = i;//anF
1268 for(int k=0;k<numFlowCells;k++){
1269 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1270 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1276 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1277 uniqueLengths.resize(numUniques);
1279 flowDataPrI.resize(numSeqs * numFlowCells, 0);
1280 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
1282 catch(exception& e) {
1283 m->errorOut(e, "ShhherCommand", "getUniques");
1288 /**************************************************************************************************/
1290 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1292 int minLength = lengths[mapSeqToUnique[seqA]];
1293 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
1295 int ANumFlowCells = seqA * numFlowCells;
1296 int BNumFlowCells = seqB * numFlowCells;
1300 for(int i=0;i<minLength;i++){
1302 if (m->control_pressed) { break; }
1304 int flowAIntI = flowDataIntI[ANumFlowCells + i];
1305 float flowAPrI = flowDataPrI[ANumFlowCells + i];
1307 int flowBIntI = flowDataIntI[BNumFlowCells + i];
1308 float flowBPrI = flowDataPrI[BNumFlowCells + i];
1309 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1312 dist /= (float) minLength;
1315 catch(exception& e) {
1316 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1321 //**********************************************************************************************************************/
1323 string ShhherCommand::cluster(string distFileName, string namesFileName){
1326 ReadMatrix* read = new ReadColumnMatrix(distFileName);
1327 read->setCutoff(cutoff);
1329 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1330 clusterNameMap->readMap();
1331 read->read(clusterNameMap);
1333 ListVector* list = read->getListVector();
1334 SparseMatrix* matrix = read->getMatrix();
1337 delete clusterNameMap;
1339 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1341 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
1342 string tag = cluster->getTag();
1344 double clusterCutoff = cutoff;
1345 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1347 if (m->control_pressed) { break; }
1349 cluster->update(clusterCutoff);
1352 list->setLabel(toString(cutoff));
1354 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1356 m->openOutputFile(listFileName, listFile);
1357 list->print(listFile);
1360 delete matrix; delete cluster; delete rabund; delete list;
1362 return listFileName;
1364 catch(exception& e) {
1365 m->errorOut(e, "ShhherCommand", "cluster");
1370 /**************************************************************************************************/
1372 void ShhherCommand::calcCentroidsDriver(int start, int finish){
1374 //this function gets the most likely homopolymer length at a flow position for a group of sequences
1379 for(int i=start;i<finish;i++){
1381 if (m->control_pressed) { break; }
1385 int minFlowGram = 100000000;
1386 double minFlowValue = 1e8;
1387 change[i] = 0; //FALSE
1389 for(int j=0;j<nSeqsPerOTU[i];j++){
1390 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1393 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1394 vector<double> adF(nSeqsPerOTU[i]);
1395 vector<int> anL(nSeqsPerOTU[i]);
1397 for(int j=0;j<nSeqsPerOTU[i];j++){
1398 int index = cumNumSeqs[i] + j;
1399 int nI = seqIndex[index];
1400 int nIU = mapSeqToUnique[nI];
1403 for(k=0;k<position;k++){
1409 anL[position] = nIU;
1410 adF[position] = 0.0000;
1415 for(int j=0;j<nSeqsPerOTU[i];j++){
1416 int index = cumNumSeqs[i] + j;
1417 int nI = seqIndex[index];
1419 double tauValue = singleTau[seqNumber[index]];
1421 for(int k=0;k<position;k++){
1422 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1423 adF[k] += dist * tauValue;
1427 for(int j=0;j<position;j++){
1428 if(adF[j] < minFlowValue){
1430 minFlowValue = adF[j];
1434 if(centroids[i] != anL[minFlowGram]){
1436 centroids[i] = anL[minFlowGram];
1439 else if(centroids[i] != -1){
1445 catch(exception& e) {
1446 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1451 /**************************************************************************************************/
1453 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1456 int flowAValue = cent * numFlowCells;
1457 int flowBValue = flow * numFlowCells;
1461 for(int i=0;i<length;i++){
1462 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1467 return dist / (double)length;
1469 catch(exception& e) {
1470 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1475 /**************************************************************************************************/
1477 double ShhherCommand::getNewWeights(){
1480 double maxChange = 0;
1482 for(int i=0;i<numOTUs;i++){
1484 if (m->control_pressed) { break; }
1486 double difference = weight[i];
1489 for(int j=0;j<nSeqsPerOTU[i];j++){
1490 int index = cumNumSeqs[i] + j;
1491 double tauValue = singleTau[seqNumber[index]];
1492 weight[i] += tauValue;
1495 difference = fabs(weight[i] - difference);
1496 if(difference > maxChange){ maxChange = difference; }
1500 catch(exception& e) {
1501 m->errorOut(e, "ShhherCommand", "getNewWeights");
1506 /**************************************************************************************************/
1508 double ShhherCommand::getLikelihood(){
1512 vector<long double> P(numSeqs, 0);
1515 for(int i=0;i<numOTUs;i++){
1516 if(weight[i] > MIN_WEIGHT){
1522 for(int i=0;i<numOTUs;i++){
1524 if (m->control_pressed) { break; }
1526 for(int j=0;j<nSeqsPerOTU[i];j++){
1527 int index = cumNumSeqs[i] + j;
1528 int nI = seqIndex[index];
1529 double singleDist = dist[seqNumber[index]];
1531 P[nI] += weight[i] * exp(-singleDist * sigma);
1535 for(int i=0;i<numSeqs;i++){
1536 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1541 nLL = nLL -(double)numSeqs * log(sigma);
1545 catch(exception& e) {
1546 m->errorOut(e, "ShhherCommand", "getNewWeights");
1551 /**************************************************************************************************/
1553 void ShhherCommand::checkCentroids(){
1555 vector<int> unique(numOTUs, 1);
1557 for(int i=0;i<numOTUs;i++){
1558 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1563 for(int i=0;i<numOTUs;i++){
1565 if (m->control_pressed) { break; }
1568 for(int j=i+1;j<numOTUs;j++){
1571 if(centroids[j] == centroids[i]){
1575 weight[i] += weight[j];
1583 catch(exception& e) {
1584 m->errorOut(e, "ShhherCommand", "checkCentroids");
1588 /**************************************************************************************************/
1592 void ShhherCommand::writeQualities(vector<int> otuCounts){
1595 string thisOutputDir = outputDir;
1596 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1597 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1599 ofstream qualityFile;
1600 m->openOutputFile(qualityFileName, qualityFile);
1602 qualityFile.setf(ios::fixed, ios::floatfield);
1603 qualityFile.setf(ios::showpoint);
1604 qualityFile << setprecision(6);
1606 vector<vector<int> > qualities(numOTUs);
1607 vector<double> pr(HOMOPS, 0);
1610 for(int i=0;i<numOTUs;i++){
1612 if (m->control_pressed) { break; }
1617 if(nSeqsPerOTU[i] > 0){
1618 qualities[i].assign(1024, -1);
1620 while(index < numFlowCells){
1621 double maxPrValue = 1e8;
1622 short maxPrIndex = -1;
1623 double count = 0.0000;
1625 pr.assign(HOMOPS, 0);
1627 for(int j=0;j<nSeqsPerOTU[i];j++){
1628 int lIndex = cumNumSeqs[i] + j;
1629 double tauValue = singleTau[seqNumber[lIndex]];
1630 int sequenceIndex = aaI[i][j];
1631 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1635 for(int s=0;s<HOMOPS;s++){
1636 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1640 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1641 maxPrValue = pr[maxPrIndex];
1643 if(count > MIN_COUNT){
1645 double norm = 0.0000;
1647 for(int s=0;s<HOMOPS;s++){
1648 norm += exp(-(pr[s] - maxPrValue));
1651 for(int s=1;s<=maxPrIndex;s++){
1653 double temp = 0.0000;
1655 U += exp(-(pr[s-1]-maxPrValue))/norm;
1663 temp = floor(-10 * temp);
1664 value = (int)floor(temp);
1665 if(value > 100){ value = 100; }
1667 qualities[i][base] = (int)value;
1677 if(otuCounts[i] > 0){
1678 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1680 int j=4; //need to get past the first four bases
1681 while(qualities[i][j] != -1){
1682 qualityFile << qualities[i][j] << ' ';
1685 qualityFile << endl;
1688 qualityFile.close();
1689 outputNames.push_back(qualityFileName);
1692 catch(exception& e) {
1693 m->errorOut(e, "ShhherCommand", "writeQualities");
1698 /**************************************************************************************************/
1700 void ShhherCommand::writeSequences(vector<int> otuCounts){
1702 string thisOutputDir = outputDir;
1703 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1704 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1706 m->openOutputFile(fastaFileName, fastaFile);
1708 vector<string> names(numOTUs, "");
1710 for(int i=0;i<numOTUs;i++){
1712 if (m->control_pressed) { break; }
1714 int index = centroids[i];
1716 if(otuCounts[i] > 0){
1717 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1721 for(int j=0;j<numFlowCells;j++){
1723 char base = flowOrder[j % 4];
1724 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1729 fastaFile << newSeq.substr(4) << endl;
1734 outputNames.push_back(fastaFileName);
1736 if(compositeFASTAFileName != ""){
1737 m->appendFiles(fastaFileName, compositeFASTAFileName);
1740 catch(exception& e) {
1741 m->errorOut(e, "ShhherCommand", "writeSequences");
1746 /**************************************************************************************************/
1748 void ShhherCommand::writeNames(vector<int> otuCounts){
1750 string thisOutputDir = outputDir;
1751 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1752 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1754 m->openOutputFile(nameFileName, nameFile);
1756 for(int i=0;i<numOTUs;i++){
1758 if (m->control_pressed) { break; }
1760 if(otuCounts[i] > 0){
1761 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1763 for(int j=1;j<nSeqsPerOTU[i];j++){
1764 nameFile << ',' << seqNameVector[aaI[i][j]];
1771 outputNames.push_back(nameFileName);
1774 if(compositeNamesFileName != ""){
1775 m->appendFiles(nameFileName, compositeNamesFileName);
1778 catch(exception& e) {
1779 m->errorOut(e, "ShhherCommand", "writeNames");
1784 /**************************************************************************************************/
1786 void ShhherCommand::writeGroups(){
1788 string thisOutputDir = outputDir;
1789 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1790 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1791 string groupFileName = fileRoot + "shhh.groups";
1793 m->openOutputFile(groupFileName, groupFile);
1795 for(int i=0;i<numSeqs;i++){
1796 if (m->control_pressed) { break; }
1797 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1800 outputNames.push_back(groupFileName);
1803 catch(exception& e) {
1804 m->errorOut(e, "ShhherCommand", "writeGroups");
1809 /**************************************************************************************************/
1811 void ShhherCommand::writeClusters(vector<int> otuCounts){
1813 string thisOutputDir = outputDir;
1814 if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
1815 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1816 ofstream otuCountsFile;
1817 m->openOutputFile(otuCountsFileName, otuCountsFile);
1819 string bases = flowOrder;
1821 for(int i=0;i<numOTUs;i++){
1823 if (m->control_pressed) {
1826 //output the translated version of the centroid sequence for the otu
1827 if(otuCounts[i] > 0){
1828 int index = centroids[i];
1830 otuCountsFile << "ideal\t";
1831 for(int j=8;j<numFlowCells;j++){
1832 char base = bases[j % 4];
1833 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1834 otuCountsFile << base;
1837 otuCountsFile << endl;
1839 for(int j=0;j<nSeqsPerOTU[i];j++){
1840 int sequence = aaI[i][j];
1841 otuCountsFile << seqNameVector[sequence] << '\t';
1845 for(int k=0;k<lengths[sequence];k++){
1846 char base = bases[k % 4];
1847 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1849 for(int s=0;s<freq;s++){
1851 //otuCountsFile << base;
1854 otuCountsFile << newSeq.substr(4) << endl;
1856 otuCountsFile << endl;
1859 otuCountsFile.close();
1860 outputNames.push_back(otuCountsFileName);
1863 catch(exception& e) {
1864 m->errorOut(e, "ShhherCommand", "writeClusters");
1870 //**********************************************************************************************************************
1872 int ShhherCommand::execute(){
1874 if (abort == true) { return 0; }
1876 getSingleLookUp(); if (m->control_pressed) { return 0; }
1877 getJointLookUp(); if (m->control_pressed) { return 0; }
1879 int numFiles = flowFileVector.size();
1881 if (numFiles < processors) { processors = numFiles; }
1883 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1884 if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1885 else { createProcesses(flowFileVector); } //each processor processes one file
1887 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1890 if(compositeFASTAFileName != ""){
1891 outputNames.push_back(compositeFASTAFileName);
1892 outputNames.push_back(compositeNamesFileName);
1895 m->mothurOutEndLine();
1896 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1897 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
1898 m->mothurOutEndLine();
1902 catch(exception& e) {
1903 m->errorOut(e, "ShhherCommand", "execute");
1908 /**************************************************************************************************/
1910 int ShhherCommand::createProcesses(vector<string> filenames){
1912 vector<int> processIDS;
1917 if (filenames.size() < processors) { processors = filenames.size(); }
1919 //divide the groups between the processors
1920 vector<linePair> lines;
1921 int numFilesPerProcessor = filenames.size() / processors;
1922 for (int i = 0; i < processors; i++) {
1923 int startIndex = i * numFilesPerProcessor;
1924 int endIndex = (i+1) * numFilesPerProcessor;
1925 if(i == (processors - 1)){ endIndex = filenames.size(); }
1926 lines.push_back(linePair(startIndex, endIndex));
1929 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1931 //loop through and create all the processes you want
1932 while (process != processors) {
1936 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
1938 }else if (pid == 0){
1939 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1942 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
1943 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1949 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1951 //force parent to wait until all the processes are done
1952 for (int i=0;i<processIDS.size();i++) {
1953 int temp = processIDS[i];
1959 //////////////////////////////////////////////////////////////////////////////////////////////////////
1961 /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1963 //////////////////////////////////////////////////////////////////////////////////////////////////////
1964 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
1965 //Above fork() will clone, so memory is separate, but that's not the case with windows,
1966 //////////////////////////////////////////////////////////////////////////////////////////////////////
1968 vector<shhhFlowsData*> pDataArray;
1969 DWORD dwThreadIdArray[processors-1];
1970 HANDLE hThreadArray[processors-1];
1972 //Create processor worker threads.
1973 for( int i=0; i<processors-1; i++ ){
1974 // Allocate memory for thread data.
1975 string extension = "";
1976 if (i != 0) { extension = toString(i) + ".temp"; }
1978 shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1979 pDataArray.push_back(tempFlow);
1980 processIDS.push_back(i);
1982 hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
1985 //using the main process as a worker saves time and memory
1987 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1989 //Wait until all threads have terminated.
1990 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1992 //Close all thread handles and free memory allocations.
1993 for(int i=0; i < pDataArray.size(); i++){
1994 for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1995 CloseHandle(hThreadArray[i]);
1996 delete pDataArray[i];
2001 for (int i=0;i<processIDS.size();i++) {
2002 if (compositeFASTAFileName != "") {
2003 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2004 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2005 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2006 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2013 catch(exception& e) {
2014 m->errorOut(e, "ShhherCommand", "createProcesses");
2018 /**************************************************************************************************/
2020 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2023 for(int i=start;i<end;i++){
2025 if (m->control_pressed) { break; }
2027 string flowFileName = filenames[i];
2029 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2030 m->mothurOut("Reading flowgrams...\n");
2032 vector<string> seqNameVector;
2033 vector<int> lengths;
2034 vector<short> flowDataIntI;
2035 vector<double> flowDataPrI;
2036 map<string, int> nameMap;
2037 vector<short> uniqueFlowgrams;
2038 vector<int> uniqueCount;
2039 vector<int> mapSeqToUnique;
2040 vector<int> mapUniqueToSeq;
2041 vector<int> uniqueLengths;
2044 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2046 if (m->control_pressed) { break; }
2048 m->mothurOut("Identifying unique flowgrams...\n");
2049 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2051 if (m->control_pressed) { break; }
2053 m->mothurOut("Calculating distances between flowgrams...\n");
2054 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2055 unsigned long long begTime = time(NULL);
2056 double begClock = clock();
2058 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2060 m->mothurOutEndLine();
2061 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2064 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2065 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2067 if (m->control_pressed) { break; }
2069 m->mothurOut("\nClustering flowgrams...\n");
2070 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2071 cluster(listFileName, distFileName, namesFileName);
2073 if (m->control_pressed) { break; }
2075 vector<int> otuData;
2076 vector<int> cumNumSeqs;
2077 vector<int> nSeqsPerOTU;
2078 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2079 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2080 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
2081 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
2084 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2086 if (m->control_pressed) { break; }
2088 m->mothurRemove(distFileName);
2089 m->mothurRemove(namesFileName);
2090 m->mothurRemove(listFileName);
2092 vector<double> dist; //adDist - distance of sequences to centroids
2093 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
2094 vector<int> centroids; //the representative flowgram for each cluster m
2095 vector<double> weight;
2096 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2097 vector<int> nSeqsBreaks;
2098 vector<int> nOTUsBreaks;
2100 dist.assign(numSeqs * numOTUs, 0);
2101 change.assign(numOTUs, 1);
2102 centroids.assign(numOTUs, -1);
2103 weight.assign(numOTUs, 0);
2104 singleTau.assign(numSeqs, 1.0);
2106 nSeqsBreaks.assign(2, 0);
2107 nOTUsBreaks.assign(2, 0);
2110 nSeqsBreaks[1] = numSeqs;
2111 nOTUsBreaks[1] = numOTUs;
2113 if (m->control_pressed) { break; }
2115 double maxDelta = 0;
2119 begTime = time(NULL);
2121 m->mothurOut("\nDenoising flowgrams...\n");
2122 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2124 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2126 if (m->control_pressed) { break; }
2128 double cycClock = clock();
2129 unsigned long long cycTime = time(NULL);
2130 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2132 if (m->control_pressed) { break; }
2134 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2136 if (m->control_pressed) { break; }
2138 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
2140 if (m->control_pressed) { break; }
2142 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
2144 if (m->control_pressed) { break; }
2146 checkCentroids(numOTUs, centroids, weight);
2148 if (m->control_pressed) { break; }
2150 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2152 if (m->control_pressed) { break; }
2156 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2160 if (m->control_pressed) { break; }
2162 m->mothurOut("\nFinalizing...\n");
2163 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2165 if (m->control_pressed) { break; }
2167 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2169 if (m->control_pressed) { break; }
2171 vector<int> otuCounts(numOTUs, 0);
2172 for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
2174 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2176 if (m->control_pressed) { break; }
2178 writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2179 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2180 writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
2181 writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
2182 writeGroups(flowFileName, numSeqs, seqNameVector); if (m->control_pressed) { break; }
2184 m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2187 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2191 }catch(exception& e) {
2192 m->errorOut(e, "ShhherCommand", "driver");
2197 /**************************************************************************************************/
2198 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2203 m->openInputFile(filename, flowFile);
2206 int currentNumFlowCells;
2208 thisSeqNameVector.clear();
2209 thisLengths.clear();
2210 thisFlowDataIntI.clear();
2211 thisNameMap.clear();
2213 flowFile >> numFlowCells;
2214 int index = 0;//pcluster
2215 while(!flowFile.eof()){
2217 if (m->control_pressed) { break; }
2219 flowFile >> seqName >> currentNumFlowCells;
2220 thisLengths.push_back(currentNumFlowCells);
2222 thisSeqNameVector.push_back(seqName);
2223 thisNameMap[seqName] = index++;//pcluster
2225 for(int i=0;i<numFlowCells;i++){
2226 flowFile >> intensity;
2227 if(intensity > 9.99) { intensity = 9.99; }
2228 int intI = int(100 * intensity + 0.0001);
2229 thisFlowDataIntI.push_back(intI);
2231 m->gobble(flowFile);
2235 int numSeqs = thisSeqNameVector.size();
2237 for(int i=0;i<numSeqs;i++){
2239 if (m->control_pressed) { break; }
2241 int iNumFlowCells = i * numFlowCells;
2242 for(int j=thisLengths[i];j<numFlowCells;j++){
2243 thisFlowDataIntI[iNumFlowCells + j] = 0;
2250 catch(exception& e) {
2251 m->errorOut(e, "ShhherCommand", "getFlowData");
2255 /**************************************************************************************************/
2257 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2260 ostringstream outStream;
2261 outStream.setf(ios::fixed, ios::floatfield);
2262 outStream.setf(ios::dec, ios::basefield);
2263 outStream.setf(ios::showpoint);
2264 outStream.precision(6);
2266 int begTime = time(NULL);
2267 double begClock = clock();
2269 for(int i=0;i<stopSeq;i++){
2271 if (m->control_pressed) { break; }
2273 for(int j=0;j<i;j++){
2274 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2276 if(flowDistance < 1e-6){
2277 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2279 else if(flowDistance <= cutoff){
2280 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2284 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2285 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2286 m->mothurOutEndLine();
2290 ofstream distFile(distFileName.c_str());
2291 distFile << outStream.str();
2294 if (m->control_pressed) {}
2296 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2297 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2298 m->mothurOutEndLine();
2303 catch(exception& e) {
2304 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2308 /**************************************************************************************************/
2310 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2312 int minLength = lengths[mapSeqToUnique[seqA]];
2313 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
2315 int ANumFlowCells = seqA * numFlowCells;
2316 int BNumFlowCells = seqB * numFlowCells;
2320 for(int i=0;i<minLength;i++){
2322 if (m->control_pressed) { break; }
2324 int flowAIntI = flowDataIntI[ANumFlowCells + i];
2325 float flowAPrI = flowDataPrI[ANumFlowCells + i];
2327 int flowBIntI = flowDataIntI[BNumFlowCells + i];
2328 float flowBPrI = flowDataPrI[BNumFlowCells + i];
2329 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2332 dist /= (float) minLength;
2335 catch(exception& e) {
2336 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2341 /**************************************************************************************************/
2343 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2346 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2347 uniqueCount.assign(numSeqs, 0); // anWeights
2348 uniqueLengths.assign(numSeqs, 0);
2349 mapSeqToUnique.assign(numSeqs, -1);
2350 mapUniqueToSeq.assign(numSeqs, -1);
2352 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2354 for(int i=0;i<numSeqs;i++){
2356 if (m->control_pressed) { break; }
2360 vector<short> current(numFlowCells);
2361 for(int j=0;j<numFlowCells;j++){
2362 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2365 for(int j=0;j<numUniques;j++){
2366 int offset = j * numFlowCells;
2370 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
2371 else { shorterLength = uniqueLengths[j]; }
2373 for(int k=0;k<shorterLength;k++){
2374 if(current[k] != uniqueFlowgrams[offset + k]){
2381 mapSeqToUnique[i] = j;
2384 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
2390 if(index == numUniques){
2391 uniqueLengths[numUniques] = lengths[i];
2392 uniqueCount[numUniques] = 1;
2393 mapSeqToUnique[i] = numUniques;//anMap
2394 mapUniqueToSeq[numUniques] = i;//anF
2396 for(int k=0;k<numFlowCells;k++){
2397 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2398 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2404 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2405 uniqueLengths.resize(numUniques);
2406 uniqueFlowgrams.resize(numFlowCells * numUniques);
2408 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2409 for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
2413 catch(exception& e) {
2414 m->errorOut(e, "ShhherCommand", "getUniques");
2418 /**************************************************************************************************/
2419 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2422 vector<string> duplicateNames(numUniques, "");
2423 for(int i=0;i<numSeqs;i++){
2424 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2428 m->openOutputFile(filename, nameFile);
2430 for(int i=0;i<numUniques;i++){
2432 if (m->control_pressed) { break; }
2434 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2435 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2442 catch(exception& e) {
2443 m->errorOut(e, "ShhherCommand", "createNamesFile");
2447 //**********************************************************************************************************************
2449 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2452 ReadMatrix* read = new ReadColumnMatrix(distFileName);
2453 read->setCutoff(cutoff);
2455 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2456 clusterNameMap->readMap();
2457 read->read(clusterNameMap);
2459 ListVector* list = read->getListVector();
2460 SparseMatrix* matrix = read->getMatrix();
2463 delete clusterNameMap;
2465 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2467 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
2468 string tag = cluster->getTag();
2470 double clusterCutoff = cutoff;
2471 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2473 if (m->control_pressed) { break; }
2475 cluster->update(clusterCutoff);
2478 list->setLabel(toString(cutoff));
2481 m->openOutputFile(filename, listFile);
2482 list->print(listFile);
2485 delete matrix; delete cluster; delete rabund; delete list;
2489 catch(exception& e) {
2490 m->errorOut(e, "ShhherCommand", "cluster");
2494 /**************************************************************************************************/
2496 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
2497 vector<int>& cumNumSeqs,
2498 vector<int>& nSeqsPerOTU,
2499 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
2500 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
2501 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
2502 vector<int>& seqIndex,
2503 map<string, int>& nameMap){
2507 m->openInputFile(fileName, listFile);
2511 listFile >> label >> numOTUs;
2513 otuData.assign(numSeqs, 0);
2514 cumNumSeqs.assign(numOTUs, 0);
2515 nSeqsPerOTU.assign(numOTUs, 0);
2516 aaP.clear();aaP.resize(numOTUs);
2522 string singleOTU = "";
2524 for(int i=0;i<numOTUs;i++){
2526 if (m->control_pressed) { break; }
2528 listFile >> singleOTU;
2530 istringstream otuString(singleOTU);
2534 string seqName = "";
2536 for(int j=0;j<singleOTU.length();j++){
2537 char letter = otuString.get();
2543 map<string,int>::iterator nmIt = nameMap.find(seqName);
2544 int index = nmIt->second;
2546 nameMap.erase(nmIt);
2550 aaP[i].push_back(index);
2555 map<string,int>::iterator nmIt = nameMap.find(seqName);
2557 int index = nmIt->second;
2558 nameMap.erase(nmIt);
2562 aaP[i].push_back(index);
2567 sort(aaP[i].begin(), aaP[i].end());
2568 for(int j=0;j<nSeqsPerOTU[i];j++){
2569 seqNumber.push_back(aaP[i][j]);
2571 for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2572 aaP[i].push_back(0);
2578 for(int i=1;i<numOTUs;i++){
2579 cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2582 seqIndex = seqNumber;
2589 catch(exception& e) {
2590 m->errorOut(e, "ShhherCommand", "getOTUData");
2594 /**************************************************************************************************/
2596 int ShhherCommand::calcCentroidsDriver(int numOTUs,
2597 vector<int>& cumNumSeqs,
2598 vector<int>& nSeqsPerOTU,
2599 vector<int>& seqIndex,
2600 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
2601 vector<int>& centroids, //the representative flowgram for each cluster m
2602 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
2603 vector<int>& mapSeqToUnique,
2604 vector<short>& uniqueFlowgrams,
2605 vector<short>& flowDataIntI,
2606 vector<int>& lengths,
2608 vector<int>& seqNumber){
2610 //this function gets the most likely homopolymer length at a flow position for a group of sequences
2615 for(int i=0;i<numOTUs;i++){
2617 if (m->control_pressed) { break; }
2621 int minFlowGram = 100000000;
2622 double minFlowValue = 1e8;
2623 change[i] = 0; //FALSE
2625 for(int j=0;j<nSeqsPerOTU[i];j++){
2626 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2629 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2630 vector<double> adF(nSeqsPerOTU[i]);
2631 vector<int> anL(nSeqsPerOTU[i]);
2633 for(int j=0;j<nSeqsPerOTU[i];j++){
2634 int index = cumNumSeqs[i] + j;
2635 int nI = seqIndex[index];
2636 int nIU = mapSeqToUnique[nI];
2639 for(k=0;k<position;k++){
2645 anL[position] = nIU;
2646 adF[position] = 0.0000;
2651 for(int j=0;j<nSeqsPerOTU[i];j++){
2652 int index = cumNumSeqs[i] + j;
2653 int nI = seqIndex[index];
2655 double tauValue = singleTau[seqNumber[index]];
2657 for(int k=0;k<position;k++){
2658 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2659 adF[k] += dist * tauValue;
2663 for(int j=0;j<position;j++){
2664 if(adF[j] < minFlowValue){
2666 minFlowValue = adF[j];
2670 if(centroids[i] != anL[minFlowGram]){
2672 centroids[i] = anL[minFlowGram];
2675 else if(centroids[i] != -1){
2683 catch(exception& e) {
2684 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2688 /**************************************************************************************************/
2690 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2691 vector<short>& flowDataIntI, int numFlowCells){
2694 int flowAValue = cent * numFlowCells;
2695 int flowBValue = flow * numFlowCells;
2699 for(int i=0;i<length;i++){
2700 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2705 return dist / (double)length;
2707 catch(exception& e) {
2708 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2712 /**************************************************************************************************/
2714 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2717 double maxChange = 0;
2719 for(int i=0;i<numOTUs;i++){
2721 if (m->control_pressed) { break; }
2723 double difference = weight[i];
2726 for(int j=0;j<nSeqsPerOTU[i];j++){
2727 int index = cumNumSeqs[i] + j;
2728 double tauValue = singleTau[seqNumber[index]];
2729 weight[i] += tauValue;
2732 difference = fabs(weight[i] - difference);
2733 if(difference > maxChange){ maxChange = difference; }
2737 catch(exception& e) {
2738 m->errorOut(e, "ShhherCommand", "getNewWeights");
2743 /**************************************************************************************************/
2745 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2749 vector<long double> P(numSeqs, 0);
2752 for(int i=0;i<numOTUs;i++){
2753 if(weight[i] > MIN_WEIGHT){
2759 for(int i=0;i<numOTUs;i++){
2761 if (m->control_pressed) { break; }
2763 for(int j=0;j<nSeqsPerOTU[i];j++){
2764 int index = cumNumSeqs[i] + j;
2765 int nI = seqIndex[index];
2766 double singleDist = dist[seqNumber[index]];
2768 P[nI] += weight[i] * exp(-singleDist * sigma);
2772 for(int i=0;i<numSeqs;i++){
2773 if(P[i] == 0){ P[i] = DBL_EPSILON; }
2778 nLL = nLL -(double)numSeqs * log(sigma);
2782 catch(exception& e) {
2783 m->errorOut(e, "ShhherCommand", "getNewWeights");
2788 /**************************************************************************************************/
2790 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2792 vector<int> unique(numOTUs, 1);
2794 for(int i=0;i<numOTUs;i++){
2795 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2800 for(int i=0;i<numOTUs;i++){
2802 if (m->control_pressed) { break; }
2805 for(int j=i+1;j<numOTUs;j++){
2808 if(centroids[j] == centroids[i]){
2812 weight[i] += weight[j];
2822 catch(exception& e) {
2823 m->errorOut(e, "ShhherCommand", "checkCentroids");
2827 /**************************************************************************************************/
2829 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
2830 vector<double>& weight, vector<short>& change, vector<int>& centroids,
2831 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
2832 vector<int>& seqNumber, vector<int>& seqIndex,
2833 vector<short>& uniqueFlowgrams,
2834 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2839 vector<double> newTau(numOTUs,0);
2840 vector<double> norms(numSeqs, 0);
2841 nSeqsPerOTU.assign(numOTUs, 0);
2843 for(int i=0;i<numSeqs;i++){
2845 if (m->control_pressed) { break; }
2847 int indexOffset = i * numOTUs;
2849 double offset = 1e8;
2851 for(int j=0;j<numOTUs;j++){
2853 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2854 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2857 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2858 offset = dist[indexOffset + j];
2862 for(int j=0;j<numOTUs;j++){
2863 if(weight[j] > MIN_WEIGHT){
2864 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2865 norms[i] += newTau[j];
2872 for(int j=0;j<numOTUs;j++){
2873 newTau[j] /= norms[i];
2876 for(int j=0;j<numOTUs;j++){
2877 if(newTau[j] > MIN_TAU){
2879 int oldTotal = total;
2883 singleTau.resize(total, 0);
2884 seqNumber.resize(total, 0);
2885 seqIndex.resize(total, 0);
2887 singleTau[oldTotal] = newTau[j];
2889 aaP[j][nSeqsPerOTU[j]] = oldTotal;
2890 aaI[j][nSeqsPerOTU[j]] = i;
2898 catch(exception& e) {
2899 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2903 /**************************************************************************************************/
2905 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2908 for(int i=0;i<numOTUs;i++){
2910 if (m->control_pressed) { return 0; }
2912 cumNumSeqs[i] = index;
2913 for(int j=0;j<nSeqsPerOTU[i];j++){
2914 seqNumber[index] = aaP[i][j];
2915 seqIndex[index] = aaI[i][j];
2923 catch(exception& e) {
2924 m->errorOut(e, "ShhherCommand", "fill");
2928 /**************************************************************************************************/
2930 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2931 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2934 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2936 for(int i=0;i<numOTUs;i++){
2938 if (m->control_pressed) { break; }
2940 for(int j=0;j<nSeqsPerOTU[i];j++){
2941 int index = cumNumSeqs[i] + j;
2942 double tauValue = singleTau[seqNumber[index]];
2943 int sIndex = seqIndex[index];
2944 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
2948 for(int i=0;i<numSeqs;i++){
2949 double maxTau = -1.0000;
2951 for(int j=0;j<numOTUs;j++){
2952 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2953 maxTau = bigTauMatrix[i * numOTUs + j];
2958 otuData[i] = maxOTU;
2961 nSeqsPerOTU.assign(numOTUs, 0);
2963 for(int i=0;i<numSeqs;i++){
2964 int index = otuData[i];
2966 singleTau[i] = 1.0000;
2969 aaP[index][nSeqsPerOTU[index]] = i;
2970 aaI[index][nSeqsPerOTU[index]] = i;
2972 nSeqsPerOTU[index]++;
2975 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2977 catch(exception& e) {
2978 m->errorOut(e, "ShhherCommand", "setOTUs");
2982 /**************************************************************************************************/
2984 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2985 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2986 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2989 string thisOutputDir = outputDir;
2990 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
2991 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2993 ofstream qualityFile;
2994 m->openOutputFile(qualityFileName, qualityFile);
2996 qualityFile.setf(ios::fixed, ios::floatfield);
2997 qualityFile.setf(ios::showpoint);
2998 qualityFile << setprecision(6);
3000 vector<vector<int> > qualities(numOTUs);
3001 vector<double> pr(HOMOPS, 0);
3004 for(int i=0;i<numOTUs;i++){
3006 if (m->control_pressed) { break; }
3011 if(nSeqsPerOTU[i] > 0){
3012 qualities[i].assign(1024, -1);
3014 while(index < numFlowCells){
3015 double maxPrValue = 1e8;
3016 short maxPrIndex = -1;
3017 double count = 0.0000;
3019 pr.assign(HOMOPS, 0);
3021 for(int j=0;j<nSeqsPerOTU[i];j++){
3022 int lIndex = cumNumSeqs[i] + j;
3023 double tauValue = singleTau[seqNumber[lIndex]];
3024 int sequenceIndex = aaI[i][j];
3025 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3029 for(int s=0;s<HOMOPS;s++){
3030 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3034 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3035 maxPrValue = pr[maxPrIndex];
3037 if(count > MIN_COUNT){
3039 double norm = 0.0000;
3041 for(int s=0;s<HOMOPS;s++){
3042 norm += exp(-(pr[s] - maxPrValue));
3045 for(int s=1;s<=maxPrIndex;s++){
3047 double temp = 0.0000;
3049 U += exp(-(pr[s-1]-maxPrValue))/norm;
3057 temp = floor(-10 * temp);
3058 value = (int)floor(temp);
3059 if(value > 100){ value = 100; }
3061 qualities[i][base] = (int)value;
3071 if(otuCounts[i] > 0){
3072 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3074 int j=4; //need to get past the first four bases
3075 while(qualities[i][j] != -1){
3076 qualityFile << qualities[i][j] << ' ';
3077 if (j > qualities[i].size()) { break; }
3080 qualityFile << endl;
3083 qualityFile.close();
3084 outputNames.push_back(qualityFileName);
3087 catch(exception& e) {
3088 m->errorOut(e, "ShhherCommand", "writeQualities");
3093 /**************************************************************************************************/
3095 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3097 string thisOutputDir = outputDir;
3098 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3099 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3101 m->openOutputFile(fastaFileName, fastaFile);
3103 vector<string> names(numOTUs, "");
3105 for(int i=0;i<numOTUs;i++){
3107 if (m->control_pressed) { break; }
3109 int index = centroids[i];
3111 if(otuCounts[i] > 0){
3112 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3116 for(int j=0;j<numFlowCells;j++){
3118 char base = flowOrder[j % 4];
3119 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3124 fastaFile << newSeq.substr(4) << endl;
3129 outputNames.push_back(fastaFileName);
3131 if(thisCompositeFASTAFileName != ""){
3132 m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3135 catch(exception& e) {
3136 m->errorOut(e, "ShhherCommand", "writeSequences");
3141 /**************************************************************************************************/
3143 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3145 string thisOutputDir = outputDir;
3146 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3147 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3149 m->openOutputFile(nameFileName, nameFile);
3151 for(int i=0;i<numOTUs;i++){
3153 if (m->control_pressed) { break; }
3155 if(otuCounts[i] > 0){
3156 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3158 for(int j=1;j<nSeqsPerOTU[i];j++){
3159 nameFile << ',' << seqNameVector[aaI[i][j]];
3166 outputNames.push_back(nameFileName);
3169 if(thisCompositeNamesFileName != ""){
3170 m->appendFiles(nameFileName, thisCompositeNamesFileName);
3173 catch(exception& e) {
3174 m->errorOut(e, "ShhherCommand", "writeNames");
3179 /**************************************************************************************************/
3181 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3183 string thisOutputDir = outputDir;
3184 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3185 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3186 string groupFileName = fileRoot + "shhh.groups";
3188 m->openOutputFile(groupFileName, groupFile);
3190 for(int i=0;i<numSeqs;i++){
3191 if (m->control_pressed) { break; }
3192 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3195 outputNames.push_back(groupFileName);
3198 catch(exception& e) {
3199 m->errorOut(e, "ShhherCommand", "writeGroups");
3204 /**************************************************************************************************/
3206 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3208 string thisOutputDir = outputDir;
3209 if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
3210 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3211 ofstream otuCountsFile;
3212 m->openOutputFile(otuCountsFileName, otuCountsFile);
3214 string bases = flowOrder;
3216 for(int i=0;i<numOTUs;i++){
3218 if (m->control_pressed) {
3221 //output the translated version of the centroid sequence for the otu
3222 if(otuCounts[i] > 0){
3223 int index = centroids[i];
3225 otuCountsFile << "ideal\t";
3226 for(int j=8;j<numFlowCells;j++){
3227 char base = bases[j % 4];
3228 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3229 otuCountsFile << base;
3232 otuCountsFile << endl;
3234 for(int j=0;j<nSeqsPerOTU[i];j++){
3235 int sequence = aaI[i][j];
3236 otuCountsFile << seqNameVector[sequence] << '\t';
3240 for(int k=0;k<lengths[sequence];k++){
3241 char base = bases[k % 4];
3242 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3244 for(int s=0;s<freq;s++){
3246 //otuCountsFile << base;
3249 otuCountsFile << newSeq.substr(4) << endl;
3251 otuCountsFile << endl;
3254 otuCountsFile.close();
3255 outputNames.push_back(otuCountsFileName);
3258 catch(exception& e) {
3259 m->errorOut(e, "ShhherCommand", "writeClusters");
3264 /**************************************************************************************************/
3266 void ShhherCommand::getSingleLookUp(){
3268 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
3269 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3272 ifstream lookUpFile;
3273 m->openInputFile(lookupFileName, lookUpFile);
3275 for(int i=0;i<HOMOPS;i++){
3277 if (m->control_pressed) { break; }
3280 lookUpFile >> logFracFreq;
3282 for(int j=0;j<NUMBINS;j++) {
3283 lookUpFile >> singleLookUp[index];
3289 catch(exception& e) {
3290 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3295 /**************************************************************************************************/
3297 void ShhherCommand::getJointLookUp(){
3300 // the most likely joint probability (-log) that two intenities have the same polymer length
3301 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3303 for(int i=0;i<NUMBINS;i++){
3305 if (m->control_pressed) { break; }
3307 for(int j=0;j<NUMBINS;j++){
3309 double minSum = 100000000;
3311 for(int k=0;k<HOMOPS;k++){
3312 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3314 if(sum < minSum) { minSum = sum; }
3316 jointLookUp[i * NUMBINS + j] = minSum;
3320 catch(exception& e) {
3321 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3326 /**************************************************************************************************/
3328 double ShhherCommand::getProbIntensity(int intIntensity){
3331 double minNegLogProb = 100000000;
3334 for(int i=0;i<HOMOPS;i++){//loop signal strength
3336 if (m->control_pressed) { break; }
3338 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3339 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
3342 return minNegLogProb;
3344 catch(exception& e) {
3345 m->errorOut(e, "ShhherCommand", "getProbIntensity");